<a href="https://colab.research.google.com/github/Tony-labs/ai-eng-nbs-public/blob/master/Lab_%7C_Text_Generation_from_Shakespeare's_Sonnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab | Text Generation from Shakespeare's Sonnet

In [None]:
!pip install tensorflow



In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
import numpy as np

In [None]:
import requests

# Download the Shakespeare sonnets text and save it next to the notebook.
url = 'https://raw.githubusercontent.com/martin-gorner/tensorflow-rnn-shakespeare/master/shakespeare/sonnets.txt'
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # fail loudly instead of saving an HTTP error page as the corpus
with open('sonnets.txt', 'wb') as f:
    f.write(resp.content)

# Read the text back with an explicit encoding; the context manager closes the
# file handle, which a bare open(...).read() leaves open.
with open('sonnets.txt', encoding='utf-8') as f:
    data = f.read()

# One lower-cased entry per line of the file (blank lines stay as empty strings).
corpus = data.lower().split("\n")

In [None]:
# Build the word-level vocabulary for the sonnets.
from tensorflow.keras.preprocessing.text import Tokenizer

# Create a tokenizer with default settings and learn the vocabulary from the corpus lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Report how many distinct tokens were found and preview the first ten entries.
word_index = tokenizer.word_index
vocab_preview = list(word_index.items())[:10]
print(f"Number of unique tokens: {len(word_index)}")
print("First 10 words and their indices:", vocab_preview)

# Encode every corpus line as a list of word indices and show the first three.
sequences = tokenizer.texts_to_sequences(corpus)
print("First 3 sequences:", sequences[:3])

Number of unique tokens: 3374
First 10 words and their indices: [('and', 1), ('the', 2), ('to', 3), ('of', 4), ('my', 5), ('i', 6), ('in', 7), ('that', 8), ('thy', 9), ('thou', 10)]
First 3 sequences: [[878], [], []]


In [None]:
# Vocabulary size used later for the embedding and output layers.
# It is the number of entries in the fitted tokenizer's word_index plus one:
# the printed word indices start at 1, so the extra slot covers index 0.
vocab_entries = len(tokenizer.word_index)
total_words = vocab_entries + 1

print(f"The total number of unique words in the corpus is: {total_words}")

The total number of unique words in the corpus is: 3375


In [None]:
# Build the training n-grams: for every encoded line, collect each prefix of
# length two or more (first two tokens, first three, ... up to the whole line).
input_sequences = []

for line in corpus:
    # texts_to_sequences takes a list of texts, so wrap the line and unwrap the result.
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Lines with fewer than two tokens contribute no prefixes.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Print first few sequences for verification
print(input_sequences[:5])  # Show the first 5 n-gram sequences

[[3, 2], [3, 2, 313], [3, 2, 313, 1375], [3, 2, 313, 1375, 4], [118, 1376]]


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Every n-gram sequence is padded at the front ('pre') so that all rows share
# the length of the longest sequence.
max_sequence_len = max(map(len, input_sequences))

# pad_sequences already yields an array; np.array is kept so the result is
# explicitly a numpy array, as before.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Print to verify
print(f"The maximum sequence length is: {max_sequence_len}")
print("Padded sequences:\n", input_sequences)

The maximum sequence length is: 11
Padded sequences:
 [[   0    0    0 ...    0    3    2]
 [   0    0    0 ...    3    2  313]
 [   0    0    0 ...    2  313 1375]
 ...
 [   0    0    0 ...  493  493 3374]
 [   0    0    0 ...  493 3374   14]
 [   0    0    0 ... 3374   14   15]]


In [None]:
# Split every padded sequence into model input and target:
#   predictors -> all tokens of a row except the last one
#   labels     -> the last token of the row
# Column slicing performs the split for all rows at once.
padded_sequences = np.array(input_sequences)
predictors = padded_sequences[:, :-1]
labels = padded_sequences[:, -1]

# Print to verify
print("Predictors:\n", predictors)
print("Labels:\n", labels)

Predictors:
 [[   0    0    0 ...    0    0    3]
 [   0    0    0 ...    0    3    2]
 [   0    0    0 ...    3    2  313]
 ...
 [   0    0    0 ... 3373  493  493]
 [   0    0    0 ...  493  493 3374]
 [   0    0    0 ...  493 3374   14]]
Labels:
 [   2  313 1375 ... 3374   14   15]


In [None]:
# One-hot encode the integer labels; num_classes equals the vocabulary size
# (total_words) so every word index gets its own column.
#
# The result is stored back in `labels`, so guard on the number of dimensions:
# without the guard, re-running this cell would one-hot encode the already
# encoded matrix again and build a (samples, total_words, total_words) array.
if np.ndim(labels) == 1:
    labels = ku.to_categorical(labels, num_classes=total_words)

# Print the shape to verify; it should be (number of sequences, total_words)
print("One-hot encoded labels shape:", labels.shape)


One-hot encoded labels shape: (15484, 3375)


# Initialize the model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.regularizers import l2

# Next-word prediction model: it reads the max_sequence_len - 1 predictor tokens
# and outputs a softmax distribution over the vocabulary (total_words entries).
embedding_dim = 100

model = Sequential([
    # Declare the input shape with an explicit Input layer rather than the
    # Embedding `input_length` argument, which recent Keras releases deprecate
    # and no longer use to define the input shape. With Input the model is
    # built immediately, so summary() below can report output shapes and
    # parameter counts.
    Input(shape=(max_sequence_len - 1,)),

    # Embedding Layer: maps each word index to a vector of size embedding_dim
    Embedding(input_dim=total_words, output_dim=embedding_dim),

    # Bidirectional LSTM Layer; return_sequences=True keeps the per-timestep
    # outputs that the following LSTM layer consumes
    Bidirectional(LSTM(150, return_sequences=True)),

    # Dropout Layer: drops 20% of activations during training to limit overfitting
    Dropout(0.2),

    # LSTM Layer
    LSTM(100),

    # Dense Layer (Intermediate) with L2 weight regularization against overfitting
    Dense(total_words // 2, activation='relu', kernel_regularizer=l2(0.01)),

    # Dense Layer (Output): one softmax probability per vocabulary entry
    Dense(total_words, activation='softmax')
])

# Print the model summary to verify the layers
model.summary()



In [None]:
# Configure training: categorical cross-entropy as the multi-class loss, the
# Adam optimizer, and accuracy as the metric reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Verify the compilation
print("Model compiled successfully.")

Model compiled successfully.


In [None]:
# Build the model for inputs of max_sequence_len - 1 tokens per sample; the
# leading None leaves the batch dimension unspecified. Then show the summary.
input_width = max_sequence_len - 1
model.build(input_shape=(None, input_width))
model.summary()

# Train the model

Train for 250 epochs.

In [None]:
import matplotlib.pyplot as plt

# The model is expected to have been defined in an earlier cell.

# Step 1: compile with the loss that matches one-hot encoded labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Step 2: train.
# predictors: (num_samples, max_sequence_len - 1); labels: one-hot rows with
# total_words columns. A 0.2 fraction of the data is held out for validation.
fit_options = dict(epochs=250, batch_size=45, validation_split=0.2, verbose=1)
history = model.fit(predictors, labels, **fit_options)

# Step 3: plot training accuracy, plus validation accuracy when it was recorded.
curves = {
    'accuracy': history.history['accuracy'],
    'val_accuracy': history.history.get('val_accuracy', []),
}
for curve_label, values in curves.items():
    plt.plot(values, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()

Epoch 1/250
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 184ms/step - accuracy: 0.0227 - loss: 7.5868 - val_accuracy: 0.0291 - val_loss: 6.7445
Epoch 2/250
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 175ms/step - accuracy: 0.0254 - loss: 6.4755 - val_accuracy: 0.0161 - val_loss: 6.8716
Epoch 3/250
[1m127/276[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m25s[0m 172ms/step - accuracy: 0.0275 - loss: 6.3608