## Read the data

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

my_data = pd.read_csv("/kaggle/input/modern-renaissance-poetry/all.csv")
my_data.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [2]:
my_data.shape

(573, 5)

In [3]:
my_data['type'].unique()

array(['Mythology & Folklore', 'Nature', 'Love'], dtype=object)

In [4]:
# extract love poems as a list
love_poems = my_data.loc[my_data['type'] == 'Love']['content']
love_poems = list(love_poems)

love_poems[0], len(love_poems)

('Why didst thou promise such a beauteous day,\r\nAnd make me travel forth without my cloak,\r\nTo let base clouds oertake me in my way,\r\nHiding thy bravery in their rotten smoke?\r\nTis not enough that through the cloud thou break,\r\nTo dry the rain on my storm-beaten face,\r\nFor no man well of such a salve can speak\r\nThat heals the wound and cures not the disgrace:\r\nNor can thy shame give physic to my grief;\r\nThough thou repent, yet I have still the loss:\r\nThe offenders sorrow lends but weak relief\r\nTo him that bears the strong offences cross.\r\n   Ah! but those tears are pearl which thy love sheds,\r\n   And they are rich and ransom all ill deeds.\r\n \r\n \r\n ',
 326)

I will leave the training data with \n \r. So that the model can learn about line breaks and poem structure.

In [5]:
poems = love_poems
poems[11]

'Joy of my life, full oft for loving you\r\n    I bless my lot, that was so lucky placed:\r\n    But then the more your own mishap I rue,\r\n    That are so much by so mean love embased.\r\nFor had the equal heavens so much you graced\r\n    In this as in the rest, ye might invent\r\n    Some heavenly wit, whose verse could have enchased\r\n    Your glorious name in golden monument.\r\nBut since ye deignd so goodly to relent\r\n    To me your thrall, in whom is little worth,\r\n    That little that I am shall all be spent\r\n    In setting your immortal praises forth;\r\nWhose lofty argument uplifting me\r\n    Shall lift you up unto an high degree.'

## Preprocessing the Data

well suprisingly while removing extra whitespaces, the \n and \r also got erased.

In [6]:
# clean and preprocess the text
def preprocess_text(text):
    # remove extra whitespaces and convert to lowercase
    text = ' '.join(text.split()).lower()
    return text

cleaned_poems = [preprocess_text(poem) for poem in poems]
cleaned_poems[11]

'joy of my life, full oft for loving you i bless my lot, that was so lucky placed: but then the more your own mishap i rue, that are so much by so mean love embased. for had the equal heavens so much you graced in this as in the rest, ye might invent some heavenly wit, whose verse could have enchased your glorious name in golden monument. but since ye deignd so goodly to relent to me your thrall, in whom is little worth, that little that i am shall all be spent in setting your immortal praises forth; whose lofty argument uplifting me shall lift you up unto an high degree.'

In [7]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_poems)
total_words = len(tokenizer.word_index) + 1

total_words

7231

In [8]:
# Create input sequences
input_sequences = []

for poem in cleaned_poems:
    # numerical representation of poem
    token_list = tokenizer.texts_to_sequences([poem])[0]
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences[0:10]

[[141, 1000],
 [141, 1000, 28],
 [141, 1000, 28, 865],
 [141, 1000, 28, 865, 63],
 [141, 1000, 28, 865, 63, 9],
 [141, 1000, 28, 865, 63, 9, 676],
 [141, 1000, 28, 865, 63, 9, 676, 112],
 [141, 1000, 28, 865, 63, 9, 676, 112, 1],
 [141, 1000, 28, 865, 63, 9, 676, 112, 1, 76],
 [141, 1000, 28, 865, 63, 9, 676, 112, 1, 76, 15]]

In [9]:
# Pad sequences
max_sequence_len = max([len(seq) for seq in input_sequences])
print("Longest sequence is: ", max_sequence_len)

input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

input_sequences[0:3]

Longest sequence is:  2229


array([[   0,    0,    0, ...,    0,  141, 1000],
       [   0,    0,    0, ...,  141, 1000,   28],
       [   0,    0,    0, ..., 1000,   28,  865]], dtype=int32)

Predictor contains whole sequence except last word. Label would be that last word. Model has to predict this word based on predictor.

Example
```
Sequence = (array([  0,   0,   0, ...,  63,   9, 676], dtype=int32),
Predictor = array([  0,   0,   0, ..., 865,  63,   9], dtype=int32),
Label = 676)
 ```

In [10]:
# Create predictors and label
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(input_sequences[5], X[5], y[5])

# Convert y to categorical
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

[  0   0   0 ...  63   9 676] [  0   0   0 ... 865  63   9] 676


In [11]:
X[5].shape

(2228,)

## Building the Model

We will try to create a LSTM-based language model using tensorflow and keras.

In [12]:
vocab_size = 7231     # Size of your vocabulary
seq_length = 2228     # Length of your sequences
embedding_dim = 100   # Dimension of the embedding layer

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=seq_length),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))



## Training the Model

In [13]:
history = model.fit(X, y, epochs=50, batch_size=64, validation_split=0.2)

Epoch 1/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 252ms/step - loss: 7.3377 - val_loss: 7.4314
Epoch 2/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 6.6642 - val_loss: 7.6214
Epoch 3/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 250ms/step - loss: 6.5446 - val_loss: 7.7511
Epoch 4/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 6.4390 - val_loss: 7.7746
Epoch 5/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 6.3228 - val_loss: 7.8907
Epoch 6/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 6.1724 - val_loss: 8.0343
Epoch 7/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 6.0216 - val_loss: 8.1216
Epoch 8/50
[1m604/604[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 251ms/step - loss: 5.8862 - val_loss: 8.2367
Epoch 9/

## Generating Poems

After training, to generated new poems:

1. Start wtih a seed text.
2. Predicting the next word using the trained model.
3. Appending the predicted word to the seed text.
4. Repeating the process until you reach the desired length.

In [14]:
def generate_poem(model, seed_text, num_words):
    generated_text = seed_text
    
    for _ in range(num_words):
        # Tokenize and pad the input sequence
        sequence = tokenizer.texts_to_sequences([generated_text])
        padded_sequence = pad_sequences(sequence, maxlen=seq_length, padding='pre')
        
        # Predict the next word
        predicted = model.predict(padded_sequence)
        predicted_word_index = np.argmax(predicted)
        
        predicted_word = tokenizer.index_word[predicted_word_index]
        
        # Append the predicted word to the generated text
        generated_text += " " + predicted_word
    
    return generated_text

In [15]:
# Generated a poem
seed_text = "Love is"
generate_poem(model, seed_text, 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 262ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6

'Love is a sickness full of woes a race the colour that the size all which the prease of the purest sky for this a wishfull vow of the ground beneath her eyelids she or are times lord the world subdue both that that water with her eyes the fyre of woe'