### **Rhyme Analysis of Virgil's *The Æneid* in English translation** 
#### Bronwyn Bowles-King
bronwynbowlesking@gmail.com

This code is described in two articles published on Medium. See: https://medium.com/@bronwynbowlesking

### **Part 1**
#### Step 0: Load packages

In [None]:
import re
import string
import csv
import unicodedata
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import cmudict
import ast
from itertools import islice
from wordcloud import WordCloud
import pickle
from collections import Counter

#### Step 1: Define functions to clean and prepare the text file 

1.1 Cleaning function

In [None]:
def clean_line(line):
    """Return *line* reduced to lowercase ASCII words separated by single spaces.

    Steps, in order: trim and NFKC-normalise the text, spell the ligatures
    Æ/æ as "ae", turn Unicode dashes into "-" and split hyphenated words in
    two, drop every character that is still non-ASCII, delete ASCII
    punctuation and digits, then collapse whitespace and lowercase.
    """
    text = unicodedata.normalize('NFKC', line.strip())

    # Substitutions that must run before the ASCII filter:
    # (pattern, replacement, regex flags)
    pre_ascii_rules = (
        (r'[Ææ]', 'ae', re.IGNORECASE),       # ligature -> "ae"
        (r'[\u2010-\u2015\u2212]', '-', 0),   # Unicode dashes / minus -> "-"
        (r'(?<=\w)-(?=\w)', ' ', 0),          # hyphen inside a word -> space
    )
    for pattern, replacement, flags in pre_ascii_rules:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Whatever still cannot be encoded as ASCII is dropped outright
    ascii_text = text.encode('ascii', 'ignore').decode('ascii')

    # Delete punctuation, digits, and special dashes
    unwanted = re.escape(string.punctuation + string.digits)
    letters_only = re.sub(fr"[{unwanted}\u2010-\u2015]", '', ascii_text)

    # Single spaces only, all lowercase, no leading or trailing space
    return re.sub(r'\s+', ' ', letters_only).lower().strip()

1.2 Function to split text into books and store this as metadata for future reference

In [None]:
def preprocess_poem(file_path):
    """Clean a poem file line by line and record where each book sits.

    Returns a tuple ``(cleaned_lines, books)``. ``cleaned_lines`` holds every
    line that is non-empty after ``clean_line``. ``books`` is a list of dicts
    with keys ``name`` (the stripped heading as written in the file),
    ``start`` (index in ``cleaned_lines`` of the heading line) and ``end``
    (index of the last cleaned line before the next heading, or of the final
    line for the last book). A heading is any line whose stripped text starts
    with "book", case-insensitively.
    """
    books = []
    cleaned_lines = []
    open_book = None  # book currently being read; None until the first heading

    with open(file_path, 'r', encoding='utf-8') as poem_file:
        for raw_line in poem_file:
            heading = raw_line.strip()

            if heading.lower().startswith('book'):
                # A new heading closes the book that was open, if any
                if open_book is not None:
                    open_book['end'] = len(cleaned_lines) - 1
                    books.append(open_book)
                open_book = {'name': heading, 'start': len(cleaned_lines)}

            text = clean_line(raw_line)
            if text:
                cleaned_lines.append(text)

    # The last book has no following heading to close it
    if open_book is not None:
        open_book['end'] = len(cleaned_lines) - 1
        books.append(open_book)

    return cleaned_lines, books

1.3 Run the preprocessing functions

In [None]:
# Path to the raw poem text; replace the placeholder with your own location
file_path = r'your/file/pathway/The_Aeneid.txt'

# Clean every line and collect the start/end positions of each book
cleaned_lines, books = preprocess_poem(file_path)

# Save cleaned lines to a new file, one cleaned line per row
output_file_path = r'cleaned_Aeneid.txt'
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(cleaned_lines))

#### Step 2: Define the functions needed for rhyme analysis


2.1 Define a function to create a pronunciation dictionary

In [None]:
# CMU Pronouncing Dictionary, loaded once through NLTK
pronunciation_dict = cmudict.dict()


def get_all_pronunciations(word):
    """Return every pronunciation listed for *word* (a list of lists).

    The word is lowercased and stripped of surrounding punctuation before the
    lookup. A word that is not in the dictionary yields an empty list.
    """
    lookup_key = word.lower().strip('.,!?;:"\'()-')
    if lookup_key in pronunciation_dict:
        return pronunciation_dict[lookup_key]
    return []

2.1.1 Check that the dictionary has loaded and look up target words

In [None]:
# Look up each target word directly in the CMU dictionary and print the result
# (None is printed for a word that has no entry)
words = ["mind"]
for w in words:
    print(f"{w}: {pronunciation_dict.get(w.lower())}")

2.2 Extract the target rhyming portion of phonetic representations for comparison

In [None]:
def extract_rhyme_tail(pron):
    """Return the part of *pron* from its last vowel phoneme to the end.

    *pron* is a sequence of phoneme strings such as ``['M', 'AY1', 'N', 'D']``.
    Only the first two characters of a phoneme are used to decide whether it
    is a vowel, so a trailing stress digit does not matter for that test; the
    phonemes that are returned keep their stress digits. If *pron* has no
    vowel at all it is returned unchanged.
    """
    vowel_symbols = frozenset((
        'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER', 'EY',
        'IH', 'IY', 'OW', 'OY', 'UH', 'UW',
    ))

    # Walk backwards so the first hit is the last vowel
    position = len(pron) - 1
    while position >= 0:
        if pron[position][:2] in vowel_symbols:
            return pron[position:]
        position -= 1
    return pron

2.3 Detect perfect and near rhymes

In [None]:
# Perfect-rhyme test for one pair of words

def is_rhyme(word1, word2, pron_cache):
    """Return True if any pronunciation of *word1* and any of *word2* share a rhyme tail.

    *pron_cache* maps a word to its list of pronunciations; a word missing
    from the cache is treated as having none, so the result is False.
    """
    tails_first = [extract_rhyme_tail(pron) for pron in pron_cache.get(word1, [])]
    tails_second = [extract_rhyme_tail(pron) for pron in pron_cache.get(word2, [])]
    return any(
        tail_a == tail_b
        for tail_a in tails_first
        for tail_b in tails_second
    )

In [None]:
# Near-rhyme test for one pair of words

def is_near_rhyme(word1, word2, pron_cache):
    """Return True if the two words pass both near-rhyme checks.

    Check 1: each word has at least one pronunciation containing a phoneme
    marked with primary stress ('1').
    Check 2: some pronunciation of *word1* and some pronunciation of *word2*,
    each at least two phonemes long, end in the same two phonemes.
    The two checks are independent: they need not be satisfied by the same
    pair of pronunciations. Words missing from *pron_cache* never match.
    """
    prons_first = pron_cache.get(word1, [])
    prons_second = pron_cache.get(word2, [])

    def has_primary_stress(pron):
        return any('1' in phoneme for phoneme in pron)

    # Check 1: primary stress present on both sides
    both_stressed = (
        any(map(has_primary_stress, prons_first))
        and any(map(has_primary_stress, prons_second))
    )
    if not both_stressed:
        return False

    # Check 2: matching final two phonemes
    endings_second = [pron[-2:] for pron in prons_second if len(pron) >= 2]
    return any(
        pron[-2:] in endings_second
        for pron in prons_first
        if len(pron) >= 2
    )

2.4 Extract unique line pairs and construct a rhyme-checking structure to handle the Spenserian stanza

In [None]:
def all_pairs(indices):
    """Return every unordered pair from *indices* as (earlier, later) tuples."""
    pairs = []
    for position, first in enumerate(indices):
        for second in indices[position + 1:]:
            pairs.append((first, second))
    return pairs


# A Spenserian stanza has nine lines rhyming *ababbcbcc*
spenserian_stanza_length = 9

# Line positions within a stanza (counted from 0) that share a rhyme sound
a_group = [0, 2]
b_group = [1, 3, 4, 6]
c_group = [5, 7, 8]

# Every pair of line positions that the scheme expects to rhyme
spenserian_pairs = [
    pair
    for group in (a_group, b_group, c_group)
    for pair in all_pairs(group)
]

2.5 Group lines by book name and store as a dictionary

In [None]:
# Read the cleaned text, skipping blank lines.
# The file handle must be bound with `as f`; without it the comprehension
# would read from whatever `f` an earlier cell left behind (a closed file).
with open('cleaned_Aeneid.txt', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Group lines by book: a line starting with "book" opens a new book and
# becomes its key; each following line is added to that book's list.
# Lines that come before the first heading are not stored.
# Note: `books` is rebound here from the metadata list to this dict.
books = {}
current_book = None
for line in lines:
    if line.lower().startswith('book'):
        current_book = line
        books[current_book] = []
    elif current_book:
        books[current_book].append(line)

print(f"Books: {list(books.keys())}")

2.6 View the number of lines per book

In [None]:
# Line count for every book, in the order the books were read
book_names = list(books.keys())
lines_per_book = [len(book_lines) for book_lines in books.values()]
average_lines = sum(lines_per_book) / len(lines_per_book)

# Print results. Every result should be divisible by 9 (stanza length)
print("Detailed line counts")
for book_name, line_count in zip(book_names, lines_per_book):
    print(f"{book_name}: {line_count} lines")

print(f"\nAverage lines per book: {average_lines:.2f}")

2.7 Build a global pronunciation cache

In [None]:
# Last word of every non-blank line, in reading order (duplicates included)
end_words = (
    line.split()[-1].lower().strip('.,!?;:"\'()-')
    for book_lines in books.values()
    for line in book_lines
    if line.strip()
)

# One dictionary lookup per distinct word; dict.fromkeys removes duplicates
# while keeping the order in which words were first seen
global_pron_cache = {
    end_word: get_all_pronunciations(end_word)
    for end_word in dict.fromkeys(end_words)
}

2.7.1 Check that the code is working properly and inspect the data in more detail with these tests

In [None]:
# Check the first 5 items of global_pron_cache (word: list of pronunciations)

for k, v in islice(global_pron_cache.items(), 5):
    print(f"{k}: {v}")

In [None]:
# Count the cached words that list more than one pronunciation, to confirm
# that alternative pronunciations were loaded

words_with_alternatives = [
    prons for prons in global_pron_cache.values() if len(prons) > 1
]
multi_pron_count = len(words_with_alternatives)
print(f"Number of words with more than one pronunciation: {multi_pron_count}")

In [None]:
# Look up one target word in the pronunciation cache

word = "strong"
if word not in global_pron_cache:
    print(f"'{word}' not found in global_pron_cache.")
else:
    print(f"Phonetic representation/s for '{word}': {global_pron_cache[word]}")

In [None]:
# List stanzas that have fewer lines than a full Spenserian stanza; the
# rhyme analysis skips these

incomplete_stanzas = []
for book_name, book_lines in books.items():
    total_lines = len(book_lines)
    for stanza_start in range(0, total_lines, spenserian_stanza_length):
        lines_left = total_lines - stanza_start
        if lines_left >= spenserian_stanza_length:
            continue  # a full stanza fits from here
        incomplete_stanzas.append({
            'book': book_name,
            'stanza_start_line': stanza_start + 1,  # 1-based line number
            'lines_in_stanza': lines_left
        })

print(f"Incomplete stanzas: {incomplete_stanzas[:10]}")

2.7.2 Save pronunciation cache for later use

In [None]:
# Write the first 20 cache entries to CSV so the structure can be inspected
# (one row per word: the word, then its list of pronunciations)

with open("sample_global_pron_cache.csv", "w", newline='', encoding="utf-8") as f:
    csv.writer(f).writerows(islice(global_pron_cache.items(), 20))

# Store the whole cache as a .pkl file

with open('global_pron_cache.pkl', 'wb') as f:
    pickle.dump(global_pron_cache, f)

# Reload the .pkl file later (only unpickle files you created yourself)

with open('global_pron_cache.pkl', 'rb') as f:
    global_pron_cache = pickle.load(f)

### **Part 2**
#### Step 3: Run the rhyme detection process

In [None]:
# Test every expected rhyme pair in every complete stanza, using the CMU
# pronunciations only
results = []
for book_name, book_lines in books.items():
    # Only start positions that leave room for a full stanza are visited,
    # so a short trailing stanza is left out
    last_full_start = len(book_lines) - spenserian_stanza_length
    for stanza_start in range(0, last_full_start + 1, spenserian_stanza_length):
        stanza = book_lines[stanza_start:stanza_start + spenserian_stanza_length]

        last_words = [
            line.split()[-1].lower().strip('.,!?;:"\'()-')
            for line in stanza
        ]
        pron_cache = {word: global_pron_cache.get(word, []) for word in last_words}

        for i, j in spenserian_pairs:
            word1, word2 = last_words[i], last_words[j]

            # A perfect rhyme takes precedence; near rhyme is tried only after it fails
            if is_rhyme(word1, word2, pron_cache):
                rhyme_type = 'perfect'
            elif is_near_rhyme(word1, word2, pron_cache):
                rhyme_type = 'near'
            else:
                continue

            results.append({
                'book': book_name,
                'line1': stanza_start + i + 1,  # 1-based line number within the book
                'line2': stanza_start + j + 1,
                'word1': word1,
                'word2': word2,
                'rhyme_type': rhyme_type,
                'text1': stanza[i],
                'text2': stanza[j]
            })

# Save and display results
rhyme_df = pd.DataFrame(results)
print(rhyme_df.head())

rhyme_df.to_csv('cmu_rhyme_analysis.csv', index=False)

#### Step 4: Identify lines missed in the rhyme analysis

In [None]:
# Every (book, line number) that took part in at least one detected rhyme pair
existing_lines = {
    (row['book'], row[line_column])
    for _, row in rhyme_df.iterrows()
    for line_column in ('line1', 'line2')
}

# Lines (numbered from 1 within each book) that appear in no rhyme pair
missing_lines = [
    {'book': book_name, 'line_number': idx, 'text': line}
    for book_name, book_lines in books.items()
    for idx, line in enumerate(book_lines, start=1)
    if (book_name, idx) not in existing_lines
]

print(f"Missing lines (global_pron): {len(missing_lines)}.")

# Save to CSV
missing_df = pd.DataFrame(missing_lines)
missing_df.to_csv('missing_lines.csv', index=False)

#### Step 5: Check which words have no pronunciation available

In [None]:
# Cached words for which the dictionary returned no pronunciation
empty = [word for word, prons in global_pron_cache.items() if not prons]
print(f"Words with no pronunciation (global_pron_cache): {len(empty)}")
print("First five words with no pronunciation (global_pron_cache):", empty[:5])

# One word per row under a 'word' header
with open('words_no_pronunciation.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word'])
    writer.writerows([word] for word in empty)

#### Step 6: Create a list of phonetic representations for missing words

In [None]:
# Load custom pronunciations CSV. The file needs a 'word' column and a
# 'phonemes' column holding a Python literal (parsed with ast.literal_eval).
custom_pron_path = 'your/file/pathway/missing_phonetics.csv'
custom_pron = {}

# newline='' lets the csv module handle line endings inside quoted fields
with open(custom_pron_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        word = row['word'].strip().lower()
        try:
            pron = ast.literal_eval(row['phonemes'])
        except (ValueError, SyntaxError, TypeError) as err:
            # Keep going, but say which row could not be read instead of
            # silently recording an empty pronunciation
            print(f"Could not parse phonemes for '{word}': {err}")
            pron = []
        custom_pron[word] = pron

# Copy, rename and update cache with custom pronunciations.
# Empty entries (rows that failed to parse) are left out of the merge so they
# cannot overwrite a pronunciation that is already in the cache.
pron_cache_all = global_pron_cache.copy()
pron_cache_all.update(
    {word: pron for word, pron in custom_pron.items() if pron}
)

6.1 Check if the join between global_pron_cache and custom_pron was successful by searching for a target word and showing words with no pronunciations

In [None]:
# Look up one target word in the merged pronunciation cache
word = "oer"
if word not in pron_cache_all:
    print(f"'{word}' not found in pron_cache_all.")
else:
    print(f"Phonetic representation/s for '{word}': {pron_cache_all[word]}.")

In [None]:
# Words that still have no pronunciation after the custom entries were merged
empty = [word for word, prons in pron_cache_all.items() if not prons]
print(f"Words with no pronunciation (pron_cache_all): {len(empty)}")
print("First five words with no pronunciation (pron_cache_all):", empty[:5])

# Overwrites the earlier CSV: one word per row under a 'word' header
with open('words_no_pronunciation.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word'])
    writer.writerows([word] for word in empty)

#### Step 7: Re-run the rhyme analysis and check for missing lines again

In [None]:
# Repeat the rhyme detection, this time with the merged cache (CMU entries
# plus custom pronunciations)
results = []
for book_name, book_lines in books.items():
    # Only start positions that leave room for a full stanza are visited,
    # so a short trailing stanza is left out
    last_full_start = len(book_lines) - spenserian_stanza_length
    for stanza_start in range(0, last_full_start + 1, spenserian_stanza_length):
        stanza = book_lines[stanza_start:stanza_start + spenserian_stanza_length]

        last_words = [
            line.split()[-1].lower().strip('.,!?;:"\'()-')
            for line in stanza
        ]
        pron_cache = {word: pron_cache_all.get(word, []) for word in last_words}

        for i, j in spenserian_pairs:
            word1, word2 = last_words[i], last_words[j]

            # A perfect rhyme takes precedence; near rhyme is tried only after it fails
            if is_rhyme(word1, word2, pron_cache):
                rhyme_type = 'perfect'
            elif is_near_rhyme(word1, word2, pron_cache):
                rhyme_type = 'near'
            else:
                continue

            results.append({
                'book': book_name,
                'line1': stanza_start + i + 1,  # 1-based line number within the book
                'line2': stanza_start + j + 1,
                'word1': word1,
                'word2': word2,
                'rhyme_type': rhyme_type,
                'text1': stanza[i],
                'text2': stanza[j]
            })

# Save and display results
rhyme_df = pd.DataFrame(results)
print(rhyme_df.head())
rhyme_df.to_csv('custom_rhyme_analysis.csv', index=False)

In [None]:
# Every (book, line number) that took part in at least one detected rhyme pair
existing_lines = {
    (row['book'], row[line_column])
    for _, row in rhyme_df.iterrows()
    for line_column in ('line1', 'line2')
}

# Lines (numbered from 1 within each book) that appear in no rhyme pair
missing_lines = [
    {'book': book_name, 'line_number': idx, 'text': line}
    for book_name, book_lines in books.items()
    for idx, line in enumerate(book_lines, start=1)
    if (book_name, idx) not in existing_lines
]

print(f"Missing lines (pron_cache_all): {len(missing_lines)}.")

# Save to CSV
missing_df = pd.DataFrame(missing_lines)
missing_df.to_csv('missing_lines2.csv', index=False)

7.1 Save new pronunciation cache for later use

In [None]:
# Write the first 20 entries of pron_cache_all to CSV so the structure can be
# inspected (one row per word: the word, then its list of pronunciations)
with open("sample_pron_cache_all.csv", "w", newline='', encoding="utf-8") as f:
    csv.writer(f).writerows(islice(pron_cache_all.items(), 20))

# Store pron_cache_all as a .pkl file
with open('pron_cache_all.pkl', 'wb') as f:
    pickle.dump(pron_cache_all, f)

# Reload the .pkl file if needed later (only unpickle files you created yourself)
with open('pron_cache_all.pkl', 'rb') as f:
    pron_cache_all = pickle.load(f)

#### Step 8: Calculate the consistency of rhyme detection 

In [None]:
# Report how many rhyming pairs are held in `results` from the latest detection run
print(f"Number of rhyme pairs found: {len(results)}")

#### Step 9: Create and save a list of final undetected words

In [None]:
# Distinct last words across all non-blank lines
all_last_words = {
    line.split()[-1].lower().strip('.,!?;:"\'()-')
    for book_lines in books.values()
    for line in book_lines
    if line.strip()
}

# Words that appear on either side of any detected rhyme pair
rhymed_words = set(rhyme_df['word1']) | set(rhyme_df['word2'])

# Last words that were never part of a detected rhyme, in alphabetical order
unrhymed_words = sorted(all_last_words - rhymed_words)

print(f"Number of last words not detected as rhyming: {len(unrhymed_words)}")

# Save to CSV: one word per row under a 'word' header
unrhymed_path = 'final_missing_words.csv'
with open(unrhymed_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word'])
    writer.writerows([word] for word in unrhymed_words)

#### Step 10: Visualisation

In [None]:
import os

# Ensure rhyme_df is defined
if 'rhyme_df' not in globals():
    rhyme_df = pd.DataFrame(results)

# Count how often each word appears in perfect and in near rhyme pairs.
# Both words of a pair are counted, row by row.
perfect_counts = Counter()
near_counts = Counter()
counts_by_type = {'perfect': perfect_counts, 'near': near_counts}

for _, row in rhyme_df.iterrows():
    counts = counts_by_type.get(row['rhyme_type'])
    if counts is not None:
        counts[row['word1']] += 1
        counts[row['word2']] += 1

# GARABD is Garamond bold from Windows default fonts. On a machine where that
# file does not exist, pass None so WordCloud uses its own default font
# instead of failing.
font_path = 'C:/Windows/Fonts/GARABD.ttf'
if not os.path.isfile(font_path):
    font_path = None

# Settings shared by both word clouds; only the colormap differs
cloud_options = dict(
    width=800,
    height=400,
    max_words=40,
    background_color='white',
    font_path=font_path,
)

# Generate word clouds
wc_perfect = WordCloud(colormap='viridis', **cloud_options).generate_from_frequencies(perfect_counts)
wc_near = WordCloud(colormap='plasma', **cloud_options).generate_from_frequencies(near_counts)

# Plot side by side
plt.figure(figsize=(16, 8))
panels = ((wc_perfect, 'Perfect Rhymes\n'), (wc_near, 'Near Rhymes\n'))
for position, (cloud, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)

plt.show()

#### Step 11: Provide a list of the most frequent end-rhyming words 

In [None]:
# Last word of every non-blank line in every book (duplicates kept for counting)
all_last_words = [
    line.split()[-1].lower().strip('.,!?;:"\'()-')
    for book_lines in books.values()
    for line in book_lines
    if line.strip()
]

# Frequency of each last word, most frequent first
last_word_counts = Counter(all_last_words)
sorted_last_words = last_word_counts.most_common()

# Print top 10 most frequent last words
print("Top 10 most frequent last words:")
for word, count in sorted_last_words[:10]:
    print(f"{word}: {count}")

# Save to CSV: one (word, frequency) row per distinct last word
freq_path = 'last_word_frequencies.csv'
with open(freq_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['word', 'frequency'])
    writer.writerows(sorted_last_words)