Charles Bukowski Poetry Generator

Description: A generative model trained on the poetry of Charles Bukowski that generates short-form poems from keywords supplied by the user.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
import re
import random

In [2]:
# Load the whole corpus into memory as a single string. The path is relative
# to the notebook's working directory.
with open('bukowski_poems.txt', mode='r', encoding='utf-8') as poem_file:
    raw_txt = poem_file.read()

# Sanity check: report the corpus size and preview its first 500 characters.
print(f"Read Test:\n\nTotal Characters: {len(raw_txt)}\n\nText from file:\n\n{raw_txt[:500]}")



Read Test:

Total Characters: 48026

Text from file:

a 340 dollar horse and a hundred dollar whore


don’t ever get the idea I am a poet; you can see me

at the racetrack any day half drunk

betting quarters, sidewheelers and straight thoroughs,

but let me tell you, there are some women there

who go where the money goes, and sometimes when you

look at these whores these onehundreddollar whores

you wonder sometimes if nature isn’t playing a joke

dealing out so much breast and ass and the way

it’s all hung together, you look and you look and




In [3]:
# Normalise the corpus in one pass: lowercase everything, then delete every
# run of digits (the surrounding whitespace is left as-is).
clean_txt = re.sub(r'\d+', '', raw_txt.lower())
print(f"Cleaned Data:\n\n{clean_txt[:500]}")

# No further cleaning is applied; the tokenizer cells read `fully_clean_txt`.
fully_clean_txt = clean_txt

Cleaned Data:

a  dollar horse and a hundred dollar whore


don’t ever get the idea i am a poet; you can see me

at the racetrack any day half drunk

betting quarters, sidewheelers and straight thoroughs,

but let me tell you, there are some women there

who go where the money goes, and sometimes when you

look at these whores these onehundreddollar whores

you wonder sometimes if nature isn’t playing a joke

dealing out so much breast and ass and the way

it’s all hung together, you look and you look and

you


In [4]:
# Word-level tokenizer with no cap on vocabulary size; words it has not seen
# are mapped to the "<unknown>" token.
tokenizer = Tokenizer(oov_token="<unknown>", num_words=None)
tokenizer.fit_on_texts([fully_clean_txt])

# Forward (word -> id) table plus a reverse (id -> word) table for decoding.
word_index = tokenizer.word_index
index_word = dict(zip(word_index.values(), word_index.keys()))

# Word ids start at 1 (see the printed sample), so one extra slot is added to
# cover the unused id 0.
size_of_vocab = 1 + len(word_index)
print(f"Size of Vocabulary:{size_of_vocab}")
print(list(word_index.items())[:20])


Size of Vocabulary:2008
[('<unknown>', 1), ('the', 2), ('and', 3), ('i', 4), ('a', 5), ('to', 6), ('in', 7), ('of', 8), ('you', 9), ('he', 10), ('that', 11), ('they', 12), ('was', 13), ('it', 14), ('on', 15), ('with', 16), ('said', 17), ('but', 18), ('my', 19), ('not', 20)]


In [5]:
# The corpus is passed as a single document, so the result holds exactly one
# id list; that list is the token stream for the whole corpus.
encoded_texts = tokenizer.texts_to_sequences([fully_clean_txt])
id_sequence = encoded_texts[0]

print(f"Total Tokens: {len(id_sequence)}")
print(id_sequence[:20])

Total Tokens: 9092
[5, 384, 527, 3, 5, 241, 384, 290, 80, 291, 66, 2, 831, 4, 145, 5, 146, 9, 81, 59]


In [6]:
# Each training example is a window of `len_of_seq` consecutive token ids;
# its label is the id that immediately follows the window.
len_of_seq = 40
window_count = len(id_sequence) - len_of_seq

input_seqs = [id_sequence[start:start + len_of_seq] for start in range(window_count)]
targ_wds = [id_sequence[start + len_of_seq] for start in range(window_count)]

# Spot-check the first (input, target) pair.
print(f"Number of pairs created: {len(input_seqs)}")
print(f"Input Sequence Test: {input_seqs[0][:5]}")
print(f"Target Word ID Test: {targ_wds[0]}")
print(f"Target Word Test: {index_word[targ_wds[0]]}")

    
    

Number of pairs created: 9052
Input Sequence Test: [5, 384, 527, 3, 5]
Target Word ID Test: 24
Target Word Test: are


In [7]:
# Every window was sliced to exactly `len_of_seq` ids, so this call mainly
# converts the list of windows into a single 2-D integer array.
padded_input_seqs = pad_sequences(input_seqs, maxlen=len_of_seq, padding='pre')

In [8]:
# `to_categorical` is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.

# Model inputs: integer token ids, one row per training window.
x = tf.cast(np.array(padded_input_seqs), tf.int32)

# Targets: one one-hot row of width `size_of_vocab` per window, as required by
# the categorical cross-entropy loss used at compile time.
y = tf.cast(to_categorical(targ_wds, num_classes=size_of_vocab), tf.float32)

print(f"X Shape: {x.shape} \n Y Shape{y.shape} \nVocab Size: {size_of_vocab}")

X Shape: (9052, 40) 
 Y Shape(9052, 2008) 
Vocab Size: 2008


Model Component

In [9]:
# Training / architecture hyperparameters.
size_of_batch=120        # samples per gradient step
lstm_units=260           # hidden units in each LSTM layer
epoch_num=60             # full passes over the training pairs
embedding_dimension=100  # size of each learned word vector
drop_rate=0.3            # dropout fraction applied after each LSTM

# Next-word classifier: embeddings -> two stacked LSTMs -> softmax over the
# vocabulary.
#
# The input shape is declared with an explicit Input instead of the
# Embedding `input_length` argument, which newer Keras releases deprecate and
# ignore. Declaring the shape up front also builds the model immediately, so
# `model.summary()` can report output shapes and parameter counts before
# training.
model=Sequential([
    tf.keras.Input(shape=(len_of_seq,), dtype="int32"),

    Embedding(input_dim=size_of_vocab, output_dim=embedding_dimension),

    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(lstm_units, return_sequences=True),
    Dropout(drop_rate),

    # The second LSTM returns only its final state, summarising the window.
    LSTM(lstm_units),
    Dropout(drop_rate),

    # One probability per vocabulary id.
    Dense(size_of_vocab, activation='softmax')
])



In [10]:
# Targets are one-hot rows, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [11]:
# Train on every (window, next-word) pair; `history` keeps the per-epoch
# loss and accuracy reported by Keras.
history = model.fit(
    x=x,
    y=y,
    epochs=epoch_num,
    batch_size=size_of_batch,
    verbose=1,
)

Epoch 1/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 2s/step - accuracy: 0.0492 - loss: 6.8759
Epoch 2/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 2s/step - accuracy: 0.0521 - loss: 6.2022
Epoch 3/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 2s/step - accuracy: 0.0544 - loss: 6.0926
Epoch 4/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 2s/step - accuracy: 0.0616 - loss: 5.9783
Epoch 5/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 2s/step - accuracy: 0.0683 - loss: 5.9285
Epoch 6/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 2s/step - accuracy: 0.0747 - loss: 5.8539
Epoch 7/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 2s/step - accuracy: 0.0773 - loss: 5.7559
Epoch 8/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 2s/step - accuracy: 0.0690 - loss: 5.7062
Epoch 9/60
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━

In [12]:

# Persist the trained model so generation can run later without retraining.
model.save('my_poem_writer_model.keras')
print("Save Location: 'my_poem_writer_model.keras'")

Save Location: 'my_poem_writer_model.keras'


In [13]:
from tensorflow.keras.models import load_model
path_to_model='my_poem_writer_model.keras'

# Reload the saved model from disk. Handlers are ordered from most to least
# specific: FileNotFoundError is a subclass of Exception, so it has to be
# listed first or its branch can never run.
# NOTE: on failure `model_var` is left undefined, so the generation cell that
# uses it will fail until the load succeeds.
try:
    model_var=load_model(path_to_model)
except FileNotFoundError:
    print(f"File not found at {path_to_model}")
except Exception as e:
    print(f"Error encountered loading model: {e}")
else:
    # Only reached when load_model returned without raising.
    print(f"Model loaded successfully from {path_to_model}")


Model loaded successfully from my_poem_writer_model.keras


In [21]:
def write_poem(model, len_of_seq, tokenizer, seed_txt, words_to_generate):
    """Greedily extend ``seed_txt`` one predicted word at a time.

    Args:
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method
            that returns one row of per-id scores for each input row.
        len_of_seq: Number of token ids the model expects per input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and an
            ``index_word`` (id -> word) mapping.
        seed_txt: Starting text; it is kept verbatim at the front of the result.
        words_to_generate: Maximum number of words to append.

    Returns:
        ``seed_txt`` followed by the generated words, separated by single
        spaces. If the model predicts an id with no word (for example the
        padding id 0), a notice is printed and the text generated so far is
        returned.
    """
    # Tokenize the seed once. Predicted ids are appended directly afterwards,
    # so the growing text never has to be re-tokenized.
    token_ids = list(tokenizer.texts_to_sequences([seed_txt])[0])
    poem_words = [seed_txt]

    for _ in range(words_to_generate):
        # Keep only the most recent `len_of_seq` ids and left-pad with zeros,
        # matching pad_sequences(padding='pre') with its default truncation.
        window = token_ids[-len_of_seq:]
        model_input = np.zeros((1, len_of_seq), dtype=np.int32)
        if window:
            model_input[0, -len(window):] = window

        # The batch holds a single row, so take row 0 before the arg-max.
        scores = model.predict(model_input, verbose=0)[0]
        id_prediction = int(np.argmax(scores))

        word_output = tokenizer.index_word.get(id_prediction, '')

        if word_output == '':
            print(f"Unknown ID predicted: {id_prediction}\n Poem Generation has been canceled.")
            break

        poem_words.append(word_output)
        token_ids.append(id_prediction)

    return " ".join(poem_words)

        

In [23]:
# Prompt and length for this generation run.
seed_txt = "cold hands"
words_to_generate = 100

# Generate with the model reloaded from disk rather than the in-memory one.
poem_output = write_poem(
    model=model_var,
    len_of_seq=len_of_seq,
    tokenizer=tokenizer,
    seed_txt=seed_txt,
    words_to_generate=words_to_generate,
)

print(f"{poem_output} \n")

NameError: name 'output_word' is not defined