# Poetry Generation with Word2Vec & N-grams

This notebook explores the fusion of Word2Vec embeddings and N-gram analysis to generate English poetry. By combining semantic understanding from Word2Vec with structural patterns from N-grams, we create novel poetic compositions that maintain both meaning and form. The implementation processes English poetry datasets to learn poetic patterns and generate new verses that capture the essence of traditional poetry while introducing innovative word combinations.

# 🎓 Library

In [3]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import random
import nltk
from collections import defaultdict, Counter
import re
import evaluate
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import rouge

# Download necessary NLTK data
nltk.download('punkt')

# Configuration des paramètres
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

DATA = "./data/en_poems.parquet"


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danedebastos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 📚 Loading data into a DataFrame (df)

In [2]:
df = pd.read_parquet(DATA)
df = df.astype({"title": "string", "text": "string", "author": "string"})
df.head()

Unnamed: 0,title,text,author
0,Song for an Unwritten Play.,"The moon's a drowsy fool to-night, Wrapped in ...","Shanks, Edward"
1,The Cup.,As a hot traveller Going through stones and sa...,"Shanks, Edward"
2,A Rhymeless Song.,Rhyme with its jingle still betrays The song t...,"Shanks, Edward"
3,Meadow and Orchard.,"My heart is like a meadow, Where clouds go ove...","Shanks, Edward"
4,Who thinks that he possesses.,Who thinks that he possesses His mistress with...,"Shanks, Edward"


In [3]:
all_text = ' '.join(df['text'].astype(str).tolist())
    

text = re.sub(r'[^\w\s]', ' ', all_text.lower())
text = re.sub(r'\s+', ' ', text).strip()


tokens = word_tokenize(text)

In [4]:
sentences = []
current_sentence = []

for token in tokens:
    current_sentence.append(token)
    if token in ['.', '!', '?', ';'] or len(current_sentence) > 15:
        sentences.append(current_sentence)
        current_sentence = []

if current_sentence:
    sentences.append(current_sentence)

# W2V 

In [44]:

w2v_model = Word2Vec(sentences=sentences, 
                    vector_size=100,
                    window=5,
                    min_count=2,
                    workers=4)

w2v_model.train(sentences, total_examples=len(sentences), epochs=10)




(116809481, 150443030)

# N-GRAM

In [6]:
ngram_models = {}
    
for n in range(2, 5):
    n_grams = list(ngrams(tokens, n))
    
    ngram_model = defaultdict(list)
    
    for gram in n_grams:
        key = tuple(gram[:-1])
        value = gram[-1]
        ngram_model[key].append(value)
    
    for key in ngram_model:
        ngram_model[key] = Counter(ngram_model[key])
    
    ngram_models[n] = ngram_model

# Poem Generator

In [9]:
def generate_poem(w2v_model, ngram_models, seed_words, num_lines=5, line_length=7, creativity=0):
    """
    Generate a poem using Word2Vec and n-gram models
    
    Parameters:
    - w2v_model: Trained Word2Vec model
    - ngram_models: Dictionary of n-gram models
    - seed_words: List of words to start with
    - num_lines: Number of lines in the poem
    - line_length: Approximate number of words per line
    - creativity: 0.0 to 1.0, higher means more Word2Vec influence vs. n-gram
    """
    
    poem = []
    current_line = []
    vocabulary = list(w2v_model.wv.index_to_key)
    
    # Start with a seed word
    current_word = seed_words
    current_line.append(current_word)
    
    for _ in range(num_lines * line_length):
        next_word = None
        
        # Decide whether to use n-gram or Word2Vec based on creativity parameter
        if random.random() > creativity:
            for n in range(min(4, len(current_line) + 1), 1, -1):
                if len(current_line) >= n - 1:
                    key = tuple(current_line[-(n-1):])
                    if key in ngram_models[n]:
                        candidates = ngram_models[n][key]
                        next_word = random.choices(
                            list(candidates.keys()),
                            weights=list(candidates.values()),
                            k=1
                        )[0]
                        break
        
        if next_word is None:
            try:
                similar_words = w2v_model.wv.most_similar(current_word, topn=10)
                next_word = random.choice(similar_words)[0]
            except:
                next_word = random.choice(vocabulary)
        
        current_line.append(next_word)
        
        if len(current_line) >= line_length:
            poem.append(' '.join(current_line))
            current_line = []

            if poem:
                last_line_words = poem[-1].split()
                seed_word = random.choice(last_line_words)
                try:
                    candidates = w2v_model.wv.most_similar(seed_word, topn=5)
                    current_word = random.choice(candidates)[0]
                except:
                    current_word = random.choice(vocabulary)
                current_line.append(current_word)
    
    return '\n'.join(poem)

# Exemple

In [25]:
poem = generate_poem(
        w2v_model=w2v_model,
        ngram_models=ngram_models,
        seed_words="love",
        num_lines=6,
        line_length=6,
        creativity=0  # Balance between structure (n-grams) and creativity (Word2Vec)
    )

print(poem)

love passion of his faith life
hope by dint of hopes hopes
hope and faith restore love pain
that was because o which yet
which miltonic mean itself miltonic whatsoever
self interest spirit itself itself looked
presence image of soul and flesh


In [11]:

model = SentenceTransformer("all-MiniLM-L6-v2")

prompts = [
    "It never ends",
    "The moonlight dances",
    "Darkness falls quickly",
    "Beneath the willow tree",
    "Whispers in the wind",
    "I dreamed of fire",
    "The silence grew louder",
    "Stars fell like rain",
    "Time forgets no one",
    "A rose in winter",
    "Shadows crawl at dawn",
    "My heart is a lantern",
    "Echoes of your name",
    "Frozen in memory",
    "We walked on glass",
    "The sky swallowed the sun",
    "Love fades to smoke",
    "Buried beneath the snow",
    "A storm without sound",
    "Hope wears thin threads"
]

all_poems = df["text"].tolist()

poem_embeddings = model.encode(all_poems, convert_to_tensor=True)

best_refs = []

for prompt in tqdm(prompts):
    prompt_embedding = model.encode(prompt, convert_to_tensor=True)
    similarities = util.cos_sim(prompt_embedding, poem_embeddings)[0]
    best_index = similarities.argmax().item()
    best_poem = all_poems[best_index]
    best_refs.append(best_poem)
    #print(f"\nPrompt: {prompt}\nBest Reference Poem:\n{best_poem}\n{'-'*80}")



100%|██████████| 20/20 [00:01<00:00, 14.95it/s]


In [47]:

# Générer le poème avec le prompt
generated_poem = generate_poem(
    w2v_model, 
    ngram_models, 
    "It never",
    num_lines=10,
    line_length=7,
    creativity=0
)

# Charger les métriques d'évaluation
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Calculer les scores ROUGE et BERTScore
rouge_score = rouge.compute(predictions=[generated_poem], references=[best_refs[0]])
bert_score = bertscore.compute(predictions=[generated_poem], references=[best_refs[0]], lang="en")

# Afficher les résultats
print("Poème généré:")
print(generated_poem)
print("\nPoème de référence:")
print(best_refs[0])

print("\nROUGE Scores:")
for key, val in rouge_score.items():
    print(f"{key}: {round(val, 4)}")

print("\nBERTScore:")
print("Precision:", round(bert_score["precision"][0], 4))
print("Recall:", round(bert_score["recall"][0], 4))
print("F1:", round(bert_score["f1"][0], 4))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Poème généré:
It never fashon fer the fowl his bewailing
illnesses lwow arsniew robustious branwen and guinivere
strokonoff meknop serge lwow arséniew of lwow
tschitsshakoff and roguenoff blanchefleur blanchefleur snote more
exaltavit humiles and slowly lifting up lira
until fat because till people know what
guess understand insane yes reckon d reckon
think believe when it tells believe to
while from all depths that sink until
melt sink steal melts among the hills
breaks strikes the cymbal rises pours out

Poème de référence:
Friends... old friends...
One sees how it ends.
A woman looks
Or a man tells lies,
And the pleasant brooks
And the quiet skies,
Ruined with brawling
And caterwauling,
Enchant no more
As they did before.
And so it ends
With friends.
Friends... old friends...
And what if it ends?
Shall we dare to shirk
What we live to learn?
It has done its work,
It has served its turn;
And, forgive and forget
Or hanker and fret,
We can be no more
As we were before.
When it ends, it

# Extend df

In [4]:
df1 = pd.read_parquet("../data/en_poems.parquet")
df2 = pd.read_parquet("../data/de_translated_en.parquet")
df = pd.concat([df1, df2], ignore_index=True)
df = df.astype({"title": "string", "text": "string", "author": "string"})
df.dtypes

title       string[python]
text        string[python]
author      string[python]
creation            object
dtype: object

In [5]:
all_text = ' '.join(df['text'].astype(str).tolist())
    
# Basic cleaning
text = re.sub(r'[^\w\s]', ' ', all_text.lower())
text = re.sub(r'\s+', ' ', text).strip()

# Tokenize text
tokens = word_tokenize(text)

In [6]:
sentences = []
current_sentence = []

for token in tokens:
    current_sentence.append(token)
    if token in ['.', '!', '?', ';'] or len(current_sentence) > 15:
        sentences.append(current_sentence)
        current_sentence = []

if current_sentence:  # Add any remaining tokens
    sentences.append(current_sentence)

In [7]:

w2v_model = Word2Vec(sentences=sentences, 
                    vector_size=100,
                    window=5,
                    min_count=2,
                    workers=4)

w2v_model.train(sentences, total_examples=len(sentences), epochs=10)


(161042643, 209310990)

In [8]:
ngram_models = {}
    
for n in range(2, 5):
    # Generate n-grams
    n_grams = list(ngrams(tokens, n))
    
    # Build a model that predicts the next word based on previous n-1 words
    ngram_model = defaultdict(list)
    
    for gram in n_grams:
        key = tuple(gram[:-1])
        value = gram[-1]
        ngram_model[key].append(value)
    
    # Convert lists to frequency distributions
    for key in ngram_model:
        ngram_model[key] = Counter(ngram_model[key])
    
    ngram_models[n] = ngram_model

In [12]:

# Générer le poème avec le prompt
generated_poem = generate_poem(
    w2v_model, 
    ngram_models, 
    "It never",
    num_lines=10,
    line_length=7,
    creativity=0.4
)

 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Poème généré:
It never hienge pendragon olivers olivers follow melodist
caoutchouc or colada colada durindale mundum shellow
seggt de herr eglamour eglamour sir launcelot
nar em dat läpel tweit morgadour sir
denys fogelsang olaf the patrik read loudly
lewis eustace fogelsang and ambrose higham and
hugh tallant pass in valentyne ambrose gazed
hugh de valentyne doth tomas lamorak lewis
may beautiful could seem must i shall
wonderful wonderfully swam back our big goal
richly costly the delicately gifted hermes swings

Poème de référence:
Friends... old friends...
One sees how it ends.
A woman looks
Or a man tells lies,
And the pleasant brooks
And the quiet skies,
Ruined with brawling
And caterwauling,
Enchant no more
As they did before.
And so it ends
With friends.
Friends... old friends...
And what if it ends?
Shall we dare to shirk
What we live to learn?
It has done its work,
It has served its turn;
And, forgive and forget
Or hanker and fret,
We can be no more
As we were before.
When it