This notebook focuses on the analysis and processing of the text "Poems Teachers Ask For". The text is first preprocessed and segmented into individual poems, which are then stored in a pandas DataFrame for analysis.
Each poem is analyzed using term frequency (TF) and inverse document frequency (IDF). These are combined to create a TF-IDF vector representation for each poem.

PROCESSING TITLES AND AUTHOR

In [16]:
# %pip install pandas
# %pip install numpy

import pandas as pd
import numpy as np
import re

# reads the txt file
with open("poems_teachers_ask_for.txt", "r") as collection:
    text = collection.read()

# extracts the table of contents for the poem title and poet name
toc_start_indicator = "INDEX"
toc_end_indicator = "PREFACE"
toc_start_index = text.find(toc_start_indicator)
toc_end_index = text.find(toc_end_indicator)

toc = text[toc_start_index: toc_end_index]

lines = toc.splitlines()
basic_info = []
for line in lines:
    res_entries = []
    # splits entries by author name
    entry = line.split("_")
    # splits by multiple spaces is author name is unknown
    if "_" not in line:
        entry = line.split("    ")
    for description in entry:
        # removes additional spaces
        clean_description = description.strip()
        if clean_description != "":
            res_entries.append(clean_description)
    #stories list of title, author, page in a list of entry lists
    basic_info.append(res_entries)
    
# removes "INDEX" specifier
del basic_info[0]
# removes any empty list [] in basic_info
basic_info = [entry for entry in basic_info if (len(entry) > 0) & (entry != [])]
for entry in basic_info:
    # removes the last element of the entries (page number)
    entry.pop()

print(basic_info)

# basic_info now contains lists of [title] or [title, author]

[['Abou Ben Adhem', 'Hunt'], ['Abraham Lincoln', 'T. Taylor'], ['All Things Bright and Beautiful', 'Alexander'], ['American Flag, The', 'Drake'], ['Answer to "Rock Me to Sleep"'], ['Arrow and the Song, The', 'Longfellow'], ['Asleep at the Switch', 'Hoey'], ['At School-Close', 'Whittier'], ['Aunt Tabitha'], ['Autumn Woods', 'Bryant'], ['Baby, The', 'Macdonald'], ['Barbara Frietchie', 'Whittier'], ['Barefoot Boy, The', 'Whittier'], ['Bay Billy', 'Gassaway'], ['Be Strong', 'Babcock'], ['Better Than Gold', 'Smart'], ['Bingen on the Rhine', 'Norton'], ['Blue and the Gray, The', 'Finch'], ["Bluebird's Song, The", 'E.H. Miller'], ['Bobby Shaftoe'], ['Boy and His Stomach, A'], ["Boy's Song, A", 'Hogg'], ['"Breathes There the Man"', 'Scott'], ['Brier-Rose', 'Boyesen'], ['Brook, The', 'Tennyson'], ['Brown Thrush, The', 'Larcom'], ['Bugle Song, The', 'Tennyson'], ['Builders, The', 'Longfellow'], ['Building of the Ship, The', 'Longfellow'], ['Burial of Sir John Moore, The', 'Wolfe'], ['Calf Path, 

NORMALIZING TITLES AND AUTHORS

In [17]:
#print(len(basic_info), len(cleaned_poems)) # 240 237

# normalizing helper function that removes punctuation and lowercasing
def normalize(text):
  text = text.lower()
  text = re.sub(r'[^a-z0-9\s]', ' ', text)
  text = re.sub(r'\s+', ' ', text)
  return text

# populate unknown titles/authors with default descriptors
normalized_titles = []
titles = []
authors = []

for entry in basic_info:
  # both title and author exists
  if len(entry) == 2:
      title, author = entry
  # only title exists
  elif len(entry) == 1:
     title = entry[0]
     author = "Unknown"
  # both title and author doesn't exist (should not occur)
  else:
     title = "Untitled"
     author = "Unknown"

  # fix titles (e.g. "American Flag, The" -> "The American Flag")
  # finds titles with ending article
  match = re.match(r'(.+),\s*(The|A|An)$', title)
  if match:
    # swaps the main title and article
    main, article = match.groups()
    title = f"{article} {main}"

  # appends to lists
  normalized_titles.append(normalize(title))
  titles.append(title)
  authors.append(author)

print(normalized_titles)
print(authors)
      


['abou ben adhem', 'abraham lincoln', 'all things bright and beautiful', 'the american flag', 'answer to rock me to sleep ', 'the arrow and the song', 'asleep at the switch', 'at school close', 'aunt tabitha', 'autumn woods', 'the baby', 'barbara frietchie', 'the barefoot boy', 'bay billy', 'be strong', 'better than gold', 'bingen on the rhine', 'the blue and the gray', 'the bluebird s song', 'bobby shaftoe', 'a boy and his stomach', 'a boy s song', ' breathes there the man ', 'brier rose', 'the brook', 'the brown thrush', 'the bugle song', 'the builders', 'the building of the ship', 'the burial of sir john moore', 'the calf path', 'casey at the bat', 'casey s revenge', 'the chambered nautilus', 'character of the happy warrior', 'the charge of the light brigade', 'the children s hour', 'the children', 'a child s thought of god', 'christ in flanders', 'christmas everywhere', 'the cloud', 'college oil cans', 'columbus', 'the concord hymn', 'the corn song', 'crossing the bar', 'curfew mus

PROCESSING MAIN CONTENT (POEMS)

In [18]:
# extracts the main content: the collection of poems
main_start_indicator = "O Captain! My Captain!"
main_end_indicator = "*** END OF THE PROJECT GUTENBERG EBOOK"
main_start_index = text.find(main_start_indicator)
main_end_index = text.find(main_end_indicator)

# the context of the collection: all of the poems
text_content = text[main_start_index:main_end_index]

In [19]:
# Ensure the running kernel looks in the project nltk_data directory
# import os
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import copy

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_poems = []
lemma_poems = []

# every poem is separated by 4 or more new lines in a row
poems = re.split(r'\n{4,}', text_content)
# a to-be-cleaned version of the list of individual opens
cleaned_poems = copy.deepcopy(poems)

for i in range(len(cleaned_poems)):
  # remove the multiple new lines in a row
  cleaned_poems[i] = re.sub(r'\n{2,}', ' ', cleaned_poems[i])
  # replaces non-alphanumeric with space
  cleaned_poems[i] = re.sub(r'\W+', ' ', cleaned_poems[i])
  # removes underscores
  cleaned_poems[i] = cleaned_poems[i].replace('_', '')
  # replaces multiple spaces with a single space
  cleaned_poems[i] = re.sub(r'\s+', ' ', cleaned_poems[i])
  # separate each word to create a list // tokenization
  cleaned_poems[i] = cleaned_poems[i].split()

  # change all words to lowercase
  for j in range(len(cleaned_poems[i])):
    cleaned_poems[i][j] = cleaned_poems[i][j].lower()

  # remove empty entries in cleaned_poems

  cleaned_poems[i] = [word for word in cleaned_poems[i] if (len(word) > 0) & (word != [])]

  # remove stopwords with nltk
  cleaned_poems[i] = [word for word in cleaned_poems[i] if word not in stop_words]

  # stemming using nltk and storing in stemmed_poems list
  stemmed = [stemmer.stem(word) for word in cleaned_poems[i]]
  stemmed_poems.append(stemmed)

  # lemmatizing using nltk and storing in lemma_poems list
  lemma = [lemmatizer.lemmatize(word) for word in cleaned_poems[i]]
  lemma_poems.append(lemma)


# cleaned_poems now contained tokenized and lowercased words
# stemmed_poems is cleamed_poems and stemmed
# lemma_poems is cleaned_poems and lemmatized

print(cleaned_poems[0])
print(stemmed_poems[0])
print(lemma_poems[0])
print(poems)



['captain', 'captain', 'poem', 'written', 'memory', 'abraham', 'lincoln', 'captain', 'captain', 'fearful', 'trip', 'done', 'ship', 'weathered', 'every', 'rack', 'prize', 'sought', 'port', 'near', 'bells', 'hear', 'people', 'exulting', 'follow', 'eyes', 'steady', 'keel', 'vessel', 'grim', 'daring', 'heart', 'heart', 'heart', 'bleeding', 'drops', 'red', 'deck', 'captain', 'lies', 'fallen', 'cold', 'dead', 'captain', 'captain', 'rise', 'hear', 'bells', 'rise', 'flag', 'flung', 'bugle', 'trills', 'bouquets', 'ribboned', 'wreaths', 'shores', 'crowding', 'call', 'swaying', 'mass', 'eager', 'faces', 'turning', 'captain', 'dear', 'father', 'arm', 'beneath', 'head', 'dream', 'deck', 'fallen', 'cold', 'dead', 'captain', 'answer', 'lips', 'pale', 'still', 'father', 'feel', 'arm', 'pulse', 'ship', 'anchored', 'safe', 'sound', 'voyage', 'closed', 'done', 'fearful', 'trip', 'victor', 'ship', 'comes', 'object', 'exult', 'shores', 'ring', 'bells', 'mournful', 'tread', 'walk', 'deck', 'captain', 'lies'

MATCHING TITLES TO POEMS

In [20]:
# Matching titles to poems

# poems not split by words; lowercased and special characters removed
normalized_poems = [normalize(''.join(poem)) for poem in poems]
print(normalized_poems)

# match poems to the title by searching for title in each poem
matched_poems = []
matched_indices = []

for title in normalized_titles:
  matched = False
  for id, poem in enumerate(normalized_poems):
    # checks if poem is already matched (using id)
    if id in matched_indices:
      continue
    if title in poem:
      matched_poems.append((title, id))
      matched_indices.append(id)
      matched = True
      break
  if not matched:
      matched_poems.append((title, None)) 
  
print(matched_poems)

[('abou ben adhem', 25), ('abraham lincoln', 0), ('all things bright and beautiful', 44), ('the american flag', 146), ('answer to rock me to sleep ', 123), ('the arrow and the song', 90), ('asleep at the switch', 64), ('at school close', 76), ('aunt tabitha', 49), ('autumn woods', 56), ('the baby', 6), ('barbara frietchie', 71), ('the barefoot boy', 191), ('bay billy', 124), ('be strong', 148), ('better than gold', 157), ('bingen on the rhine', 137), ('the blue and the gray', 209), ('the bluebird s song', 88), ('bobby shaftoe', 3), ('a boy and his stomach', 110), ('a boy s song', 181), (' breathes there the man ', 212), ('brier rose', 159), ('the brook', 24), ('the brown thrush', 203), ('the bugle song', 207), ('the builders', 202), ('the building of the ship', 73), ('the burial of sir john moore', 221), ('the calf path', 128), ('casey at the bat', 120), ('casey s revenge', 121), ('the chambered nautilus', 174), ('character of the happy warrior', 170), ('the charge of the light brigade

CREATING DATAFRAME (TITLE, AUTHOR, POEM, TOKENS (IN DIFFERENT PROCESSED FORMS))

In [21]:
# combine normalized titles, author, and poems (as list of words) into pandas data frame
data = []

for i, title in enumerate(titles):
  author = authors[i]

  # grabs index of poem in poems lists corresponding to title
  matched_index = matched_poems[i][1]

  if matched_index is not None:
    original_poem = poems[matched_index]
    cleaned = cleaned_poems[matched_index]
    stemmed = stemmed_poems[matched_index]
    lemmatized = lemma_poems[matched_index]

  data.append({
      "Title": title,
      "Author": author,
      "Original_Poem": original_poem,
      "Cleaned_Tokens": cleaned,
      "Stemmed_Tokens": stemmed,
      "Lemmatized_Tokens": lemmatized
  })

# create dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Title,Author,Original_Poem,Cleaned_Tokens,Stemmed_Tokens,Lemmatized_Tokens
0,Abou Ben Adhem,Hunt,Abou Ben Adhem\n\n\nAbou Ben Adhem--may his tr...,"[abou, ben, adhem, abou, ben, adhem, may, trib...","[abou, ben, adhem, abou, ben, adhem, may, trib...","[abou, ben, adhem, abou, ben, adhem, may, trib..."
1,Abraham Lincoln,T. Taylor,O Captain! My Captain!\n\n(_This poem was writ...,"[captain, captain, poem, written, memory, abra...","[captain, captain, poem, written, memori, abra...","[captain, captain, poem, written, memory, abra..."
2,All Things Bright and Beautiful,Alexander,All Things Bright and Beautiful\n\n\nAll thing...,"[things, bright, beautiful, things, bright, be...","[thing, bright, beauti, thing, bright, beauti,...","[thing, bright, beautiful, thing, bright, beau..."
3,The American Flag,Drake,The American Flag\n\n\nWhen Freedom from her m...,"[american, flag, freedom, mountain, height, un...","[american, flag, freedom, mountain, height, un...","[american, flag, freedom, mountain, height, un..."
4,"Answer to ""Rock Me to Sleep""",Unknown,"An Answer to ""Rock Me to Sleep""\n\n\nMy child,...","[answer, rock, sleep, child, ah, child, thou, ...","[answer, rock, sleep, child, ah, child, thou, ...","[answer, rock, sleep, child, ah, child, thou, ..."


COMPUTING TF-IDF

In [22]:
# setting up tfidf dataframe

# rows with id, title of the poem, and token
rows = []

for id, row in df.iterrows():
  # obtains information from the df
  tokens = row["Stemmed_Tokens"]
  title = row["Title"]
  total = len(tokens)
  for token in tokens:
    rows.append({"id": id, 'title': title, 'total_tokens': total, 'token': token})

# creates a data frame with all of the tokens
tokens_df = pd.DataFrame(rows)

# group by id and token
grouped = tokens_df.groupby(['id', 'token'])

# counts each token and converts to dataframe
tfidf_df = grouped.size().to_frame(name='count')
print(tfidf_df)
# merge data frames (left) for total_tokens and title
tfidf_df = tfidf_df.merge(tokens_df, on=['id', 'token'], how='left')

tfidf_df.head()



            count
id  token        
0   abou        4
    accord      1
    adhem       4
    angel       3
    answer      1
...           ...
239 weep        1
    within      2
    work        2
    wound       1
    yet         1

[34085 rows x 1 columns]


Unnamed: 0,id,token,count,title,total_tokens
0,0,abou,4,Abou Ben Adhem,92
1,0,abou,4,Abou Ben Adhem,92
2,0,abou,4,Abou Ben Adhem,92
3,0,abou,4,Abou Ben Adhem,92
4,0,accord,1,Abou Ben Adhem,92


In [23]:
#
# computing term frequency (tf)
#
tfidf_df["tf"] = tfidf_df['count'] / tfidf_df['total_tokens']

#
# computing inverse term frequency (idf)
#
total_poems = df.shape[0] 

# computes document frequency
# group by token and determines number of unique documents/ids
doc_f = tfidf_df.groupby('token')['id'].nunique() 
# computes inverse document frequency of each token
idf = np.log(total_poems / doc_f)

# maps idf to tfidf dataframe using token
tfidf_df['idf'] = tfidf_df['token'].map(idf)

#
# computing tf-idf
#
tfidf_df['tfidf'] = tfidf_df['tf'] * tfidf_df['idf']

tfidf_df.tail()

Unnamed: 0,id,token,count,title,total_tokens,tf,idf,tfidf
48263,239,within,2,Your Mission,113,0.017699,2.148434,0.038025
48264,239,work,2,Your Mission,113,0.017699,1.609438,0.028486
48265,239,work,2,Your Mission,113,0.017699,1.609438,0.028486
48266,239,wound,1,Your Mission,113,0.00885,3.283414,0.029057
48267,239,yet,1,Your Mission,113,0.00885,1.386294,0.012268


SEARCH ENGINE FUNCTIONALITY

COMPUTING TF-IDF OF SEARCH QUERY

In [24]:
query = "death and life and war"
# removes non-alphanumeric characters
query = re.sub(r'\W+', ' ', query)
# replaces underscores
query = query.replace('_', '')
# reduces whitespaces to one space
query = re.sub(r'\s+', ' ', query)
# lowercase and splits words
query_tokens = query.lower().split()

# calculates the tf-idf of the query
# computes terms with their corresponding counts
query_tf = []
query_tf_tokens = []
for token in query_tokens:
  if token not in query_tf_tokens:
    query_tf_tokens.append(token)
    query_tf.append(1)
  else:
    index = query_tf_tokens.index(token)
    query_tf[index] += 1
# creates dataframe for query tfidf
query_tfidf = pd.DataFrame(query_tf, query_tf_tokens, columns=["count"])

# computes query term frequency
query_tfidf["tf"] = query_tfidf['count'] / sum(query_tfidf['count'])
# finds corresponding idf for the token (default to 0 if not found)
query_tfidf['idf'] = [idf.get(token, 0) for token in query_tf_tokens]

# computes tfidf
query_tfidf['tfidf'] = query_tfidf['tf'] * query_tfidf['idf']
display(query_tfidf)

Unnamed: 0,count,tf,idf,tfidf
death,1,0.2,1.630491,0.326098
and,2,0.4,0.0,0.0
life,1,0.2,1.275946,0.255189
war,1,0.2,2.345145,0.469029


IMPLEMENTING COSINE SIMILARITY BETWEEN TF-IDF IN QUERY AND POEMS

In [25]:
# helper function to calculate cosine similarity
def cos_similarity(a, b):
  return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

# implementing cosine similarity between tf-idf in query and every poems
similarity_scores = []
for poem_id in df.index:
  found_in_poem = []
  for token in query_tf_tokens:
    # finds the row in tfidf_df where query token appears in given poem
    row = tfidf_df[(tfidf_df['id'] == poem_id) & (tfidf_df['token'] == token)]
    if row.empty:
      found_in_poem.append(0)
    else:
      # access the tfidf of the token in the poem 
      found_in_poem.append(row.iloc[0]['tfidf'])

  # accounts for division by zero
  if np.linalg.norm(found_in_poem) == 0 or np.linalg.norm(query_tfidf['tfidf'].values) == 0:
    similarity = 0.0
  else:
    similarity = cos_similarity(found_in_poem, query_tfidf['tfidf'].values)

  similarity_scores.append({
    "id": poem_id,
    "title": df.loc[poem_id, "Title"],
    "author": df.loc[poem_id, "Author"],
    "similarity": similarity
  })

print(similarity_scores)

# sort similarity scores by obtaining similarity of each entry in similarity_scores
similarity_scores = sorted(similarity_scores, key= lambda x: x['similarity'], reverse=True)
for result in similarity_scores[:10]:
    print(f"title: {result['title']}, author: {result['author']}, similarity: {result['similarity']}")

[{'id': 0, 'title': 'Abou Ben Adhem', 'author': 'Hunt', 'similarity': 0.0}, {'id': 1, 'title': 'Abraham Lincoln', 'author': 'T. Taylor', 'similarity': 0.0}, {'id': 2, 'title': 'All Things Bright and Beautiful', 'author': 'Alexander', 'similarity': 0.0}, {'id': 3, 'title': 'The American Flag', 'author': 'Drake', 'similarity': np.float64(0.9130389202672862)}, {'id': 4, 'title': 'Answer to "Rock Me to Sleep"', 'author': 'Unknown', 'similarity': np.float64(0.40787244339026907)}, {'id': 5, 'title': 'The Arrow and the Song', 'author': 'Longfellow', 'similarity': 0.0}, {'id': 6, 'title': 'Asleep at the Switch', 'author': 'Hoey', 'similarity': np.float64(0.40787244339026907)}, {'id': 7, 'title': 'At School-Close', 'author': 'Whittier', 'similarity': 0.0}, {'id': 8, 'title': 'Aunt Tabitha', 'author': 'Unknown', 'similarity': 0.0}, {'id': 9, 'title': 'Autumn Woods', 'author': 'Bryant', 'similarity': np.float64(0.40787244339026907)}, {'id': 10, 'title': 'The Baby', 'author': 'Macdonald', 'similar

ADDITIONAL FEATURE...

K-MEANS CLUSTERING FOR THEMATIC ORGANIZATION

In [26]:
# new tfidf with lemmatized tokens for clustering

# setting up clustering_tfidf dataframe
# rows with id, title of the poem, and token
ctfidf_rows = []

for id, row in df.iterrows():
  # obtains information from the df
  l_tokens = row['Lemmatized_Tokens']
  title = row["Title"]
  total = len(l_tokens)
  for token in l_tokens:
    ctfidf_rows.append({"id": id, 'title': title, 'total_tokens': total, 'lem_token': token})

# creates a data frame with all of the tokens
ctfidf_df = pd.DataFrame(ctfidf_rows)

# group by id and token
cgrouped = ctfidf_df.groupby(['id', 'lem_token'])

# counts each token and converts to dataframe
ctfidf_df = cgrouped.size().to_frame(name='count')
print(ctfidf_df)
# merge data frames (left) for total_tokens and title
ctfidf_df = ctfidf_df.merge(ctfidf_rows_df := pd.DataFrame(ctfidf_rows), on=['id', 'lem_token'], how='left')

ctfidf_df.head()



               count
id  lem_token       
0   abou           4
    accord         1
    adhem          4
    angel          3
    answered       1
...              ...
239 weep           1
    within         2
    work           2
    wounded        1
    yet            1

[34856 rows x 1 columns]


Unnamed: 0,id,lem_token,count,title,total_tokens
0,0,abou,4,Abou Ben Adhem,92
1,0,abou,4,Abou Ben Adhem,92
2,0,abou,4,Abou Ben Adhem,92
3,0,abou,4,Abou Ben Adhem,92
4,0,accord,1,Abou Ben Adhem,92


In [27]:
# construct document-term matrix (one vector per poem)

# creates a list of all the unique tokens
all_tokens = list(tfidf_df['token'].unique())
# assigns an index to each unique token
token_to_index = {}
for id, token in enumerate(all_tokens):
  token_to_index[token] = id

# number of poems
num_poems = df.shape[0]
# number of unique tokens
num_tokens = len(all_tokens)

# initialize the tfidf matrix with zeros
matrix = np.zeros((num_poems, num_tokens))

# populate the matrix
# loop over each row in the tfidf_df
for index, row in tfidf_df.iterrows():
  # obtain poem id
  poem_id = row['id']
  # obtain token
  token = row['token']
  # obtain tfidf score for the token
  tfidf_score = row['tfidf']

  if token in token_to_index:
    #find index corresponding to token
    token_index = token_to_index[token]

    # assign tfidf score to the correct location in matrix
    # x-coord: poem_id
    # y-coord: token_index

    matrix[poem_id, token_index] = tfidf_score    

# displaying the matrix using dataframe
matrix_df = pd.DataFrame(matrix, columns=all_tokens)
if 'title' in df.columns:
    matrix_df.index = df['title']
elif 'id' in df.columns:
    matrix_df.index = df['id']
display(matrix_df)

Unnamed: 0,abou,accord,adhem,angel,answer,awok,ben,bless,bloom,bold,...,scrihner,stopt,tank,turmoil,unweari,wharton,discipl,goddess,needi,swiftest
0,0.238289,0.044504,0.238289,0.070058,0.020621,0.052038,0.190523,0.023748,0.028155,0.030137,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.016939,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.008805,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.074821,0.009438,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
236,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
237,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.017159,0.017159,0.017159,0.017159,0.017159,0.017159,0.000000,0.000000,0.000000,0.000000
238,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.017159,0.017159,0.017159,0.017159,0.017159,0.017159,0.000000,0.000000,0.000000,0.000000


In [37]:
# initialize centroids
np.random.seed(23)
k = 3
# chooses k random values from the matrix (centroids)
initial_centroids = matrix[np.random.choice(len(matrix), size=k, replace=False)]

# assign poems to nearest centroid
# compute Euclidean distance to each centroid
# reshapes and subtract centroids from each point
distances = np.linalg.norm(matrix[:, np.newaxis] - initial_centroids, axis=2)
# finds index of minimum value in each row- closest centroid
labels = np.argmin(distances, axis=1)

In [38]:
#repeats until convergence

max_iters = 100
#tolerance
convergence_threshold = 1e-4

centroids = initial_centroids
for iter in range(max_iters):
  # 1. assign poems to nearest centroid
  # compute Euclidean distance to each centroid
  # reshapes and subtract centroids from each point
  distances = np.linalg.norm(matrix[:, np.newaxis] - centroids, axis=2)
  # finds index of minimum value in each row- closest centroid
  labels = np.argmin(distances, axis=1)

  # 2. update centroids
  # calculates mean vector of all poems in cluster for new centroid
  new_centroids = []
  for i in range(k):
    cluster_points = matrix[labels == i]
    if len(cluster_points) == 0:
      # accounts for division by zero
      # initializing centroid randomly
      new_centroids.append(matrix[np.random.choice(len(matrix))])
    else:
      new_centroids.append(cluster_points.mean(axis=0))
  new_centroids = np.array(new_centroids)

  # 3. check for convergence
  # calculates the shifts between the old and new centroids
  shifts = np.linalg.norm(new_centroids - centroids)
  print(f"Iteration {iter+1}, shift: {shifts:.6f}")

  if shifts < convergence_threshold:
      print("Converged :)")
      break

  centroids = new_centroids

Iteration 1, shift: 0.195876
Iteration 2, shift: 0.000000
Converged :)


In [39]:
# output poem clusters
# add cluster labels to dataframe
df['cluster'] = labels
for i in range(k):
    print(f"\nCluster {i}:")
    print(df[df['cluster'] == i][['Title', 'Author']])




Cluster 0:
                               Title     Author
0                     Abou Ben Adhem       Hunt
1                    Abraham Lincoln  T. Taylor
2    All Things Bright and Beautiful  Alexander
3                  The American Flag      Drake
4       Answer to "Rock Me to Sleep"    Unknown
..                               ...        ...
235              The Wonderful World      Rands
236         Woodman, Spare That Tree     Morris
237                      You and You    Wharton
238             The Young Man Waited      Cooke
239                     Your Mission      Gates

[238 rows x 2 columns]

Cluster 1:
                    Title      Author
5  The Arrow and the Song  Longfellow

Cluster 2:
                Title   Author
140  "Not Understood"  Unknown


In [40]:
# extracting keywords
cluster_keywords = {}

for i in range(k):
    cluster_vecs = matrix[labels == i]
    avg_tfidf = cluster_vecs.mean(axis=0)
    
    top_indices = avg_tfidf.argsort()[-5:][::-1]
    top_tokens = [all_tokens[idx] for idx in top_indices]
    
    cluster_keywords[i] = top_tokens

# Print keywords
for cluster, keywords in cluster_keywords.items():
    print(f"Cluster {cluster} keywords: {keywords}")


Cluster 0 keywords: ['littl', 'sleep', 'thi', 'said', 'shall']
Cluster 1 keywords: ['arrow', 'song', 'unbrok', 'afterward', 'flight']
Cluster 2 keywords: ['understood', 'often', 'judg', 'nearer', 'estrang']


In [None]:
#find k value with elbow method and silhouette score...