In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('PoetryFoundationData.csv')
print(df.head())
df_sampled = df.sample(n=100, random_state=42)
poems = df_sampled['Poem'].dropna().tolist()
corpus = "\n".join(poems)

   Unnamed: 0                                              Title  \
0           0  \r\r\n                    Objects Used to Prop...   
1           1  \r\r\n                    The New Church\r\r\n...   
2           2  \r\r\n                    Look for Me\r\r\n   ...   
3           3  \r\r\n                    Wild Life\r\r\n     ...   
4           4  \r\r\n                    Umbrella\r\r\n      ...   

                                                Poem              Poet Tags  
0  \r\r\nDog bone, stapler,\r\r\ncribbage board, ...  Michelle Menting  NaN  
1  \r\r\nThe old cupola glinted above the clouds,...     Lucia Cherciu  NaN  
2  \r\r\nLook for me under the hood\r\r\nof that ...        Ted Kooser  NaN  
3  \r\r\nBehind the silo, the Mother Rabbit\r\r\n...   Grace Cavalieri  NaN  
4  \r\r\nWhen I push your button\r\r\nyou fly off...      Connie Wanek  NaN  


In [3]:
# Data preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.strip()
    return text

corpus = preprocess_text(corpus)

In [4]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
total_words = len(tokenizer.word_index) + 1
input_sequences = []

In [5]:
# Creating sequences using a sliding window approach
corpus_words = corpus.split()
for i in range(5, len(corpus_words)):
    seq = corpus_words[i-5:i+1]
    encoded = tokenizer.texts_to_sequences([" ".join(seq)])[0]
    input_sequences.append(encoded)

In [6]:
# Padding sequences
input_sequences = np.array(pad_sequences(input_sequences, maxlen=6, padding='pre'))

In [7]:
# Splitting into predictors and label
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [8]:
# Building the LSTM model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=X.shape[1]))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [10]:
# Training the model
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 28ms/step - accuracy: 0.0726 - loss: 6.2770
Epoch 2/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 29ms/step - accuracy: 0.0775 - loss: 6.0826
Epoch 3/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 28ms/step - accuracy: 0.0792 - loss: 5.9582
Epoch 4/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 27ms/step - accuracy: 0.0809 - loss: 5.8351
Epoch 5/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 27ms/step - accuracy: 0.0932 - loss: 5.6538
Epoch 6/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 29ms/step - accuracy: 0.1040 - loss: 5.4800
Epoch 7/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 26ms/step - accuracy: 0.1103 - loss: 5.3182
Epoch 8/10
[1m610/610[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.1267 - loss: 5.1307
Epoch 9/10
[1m610/610[

<keras.src.callbacks.history.History at 0x7d3755a3abc0>

In [11]:
# Text generation function
def generate_poetry(seed_text, next_words=20):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=5, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=1)[0]
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break
    return seed_text

In [12]:
# Generate new lines of poetry
seed_text = "love and hope"
print(generate_poetry(seed_text, next_words=20))

love and hope as you are not not not not not not not not not not not not not not not not not
