# Using TF-IDF & cosine similarity to build a lyrically similar song search engine

In [1]:
import numpy as np
import pandas as pd

#for top-5-similar songs recommender
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


#for text preprocessing:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load the dataset

In [2]:
# using the preprocessed lyrics dataset 
df = pd.read_csv('../data/preprocessed_dataset.csv')

In [3]:
print("Number of nulls: ", df.isnull().sum().sum())
print("Number of duplicates: ", df.duplicated().sum())
print("df shape: ", df.shape)
print("\n")
print("df value counts: \n")
print(df.genre.value_counts())
df.head()

Number of nulls:  3
Number of duplicates:  0
df shape:  (102285, 7)


df value counts: 

Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64


Unnamed: 0.1,Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
0,0,Everyday by Elijah Blake,Elijah Blake,Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,Live Till We Die by Elijah Blake,Elijah Blake,Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,2,The Otherside by Elijah Blake,Elijah Blake,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,Pinot by Elijah Blake,Elijah Blake,Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,4,Shadows & Diamonds by Elijah Blake,Elijah Blake,Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [4]:
missing_values = df.isnull().values.any()
if(missing_values):
    display(df[df.isnull().any(axis=1)])

Unnamed: 0.1,Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
7576,11977,No by Vivian Girls,Vivian Girls,No,0.348,Pop,
48580,76269,U Can't Handle This by Mystikal,Mystikal,U Can't Handle This,0.45,Rap,
94058,134658,Being Alive [*] by Michael Crawford,Michael Crawford,Being Alive [*],0.261,Pop,


In [5]:
# Remove records with missing values
df.dropna(inplace=True)

# Start of Recommender Algorithm:
---

## Determine the term frequencies (TFs)

In [6]:
# Use a CountVectorizer to learn the terms and term frequencies across all of the documents (carols) 
#cv = CountVectorizer(stop_words='english')
cv = CountVectorizer()
doc_term_matrix = cv.fit_transform(df['clean_lyrics'])

## Perform some simple analysis

In [7]:
# Number of documents vs number of terms 
doc_term_matrix.shape

(102282, 142030)

In [8]:
# Get the terms - unique words excluding single char words like "a"
cv.get_feature_names_out()

array(['00', '000', '00000', ..., 'zzznoahh', 'zzzz', 'zzzzs'],
      dtype=object)

In [9]:
# Check the number of terms
len(cv.get_feature_names_out())

142030

In [10]:
# View the word counts across all of the documents
word_counts = pd.DataFrame(doc_term_matrix.toarray(), index=df["song_by_artist"], columns=cv.get_feature_names_out())
word_counts

Unnamed: 0_level_0,00,000,00000,0000000,0017,007,00765,007style,008,01,...,zzq,zzt,zztt,zzu,zzz,zzzeed,zzzero,zzznoahh,zzzz,zzzzs
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everyday by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Live Till We Die by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Otherside by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pinot by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Shadows & Diamonds by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ganja Babe by Michael Franti,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sandrevan Lullaby - Lifestyles by Rodriguez,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hard Rain Don't Last by Darryl Worley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rat in Mi Kitchen by UB40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# View the most and least frequent words
word_counts.sum().sort_values(ascending=False)

im                      192356
love                    145869
dont                    135653
know                    129602
like                    123559
                         ...  
meeeeeeeeeeeeeeeeeee         1
meeethe                      1
meegemaakt                   1
meekly                       1
legio                        1
Length: 142030, dtype: int64

In [12]:
# View the word counts for certain words
word_counts[["love", "baby"]]

Unnamed: 0_level_0,love,baby
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Everyday by Elijah Blake,0,0
Live Till We Die by Elijah Blake,0,0
The Otherside by Elijah Blake,1,0
Pinot by Elijah Blake,3,7
Shadows & Diamonds by Elijah Blake,9,0
...,...,...
Ganja Babe by Michael Franti,6,1
Sandrevan Lullaby - Lifestyles by Rodriguez,0,0
Hard Rain Don't Last by Darryl Worley,1,0
Rat in Mi Kitchen by UB40,0,0


## Determine the inverse document frequencies (IDFs)

In [13]:
# We have the term frequencies, now determine the inverse document frequencies (IDFs)
idfs = TfidfTransformer() 
idfs.fit(doc_term_matrix)

In [14]:
# Create a data frame with the IDF values 
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names_out(), columns=["idfs"]) 
 
# Sort ascending and display
# High IDF (1/DF) terms are less frequent across all documents; low IDF terms are more frequent 
idfs_df.sort_values(by=['idfs'], ascending=False)

Unnamed: 0,idfs
zzzzs,11.842352
janay,11.842352
jarheads,11.842352
jargoned,11.842352
steine,11.842352
...,...
love,1.979295
like,1.890908
dont,1.852549
know,1.800168


## Put it all together to calculate the TF-IDFs

In [15]:
# We have the term frequencies and inverse document frequencies - now calculate the TF-IDF scores
tf_idfs = idfs.transform(doc_term_matrix)

## Do some more analysis

In [16]:
# Create a data frame to view the TF-IDF scores for the first document, tf_idfs[0]
tf_idf_doc0 = pd.DataFrame(tf_idfs[0].T.todense(), index=cv.get_feature_names_out(), columns=["tf-idf"])
tf_idf_doc0.sort_values(by=["tf-idf"], ascending=False)

Unnamed: 0,tf-idf
everyday,0.911214
thats,0.188337
nigga,0.111225
spectacular,0.108095
flexin,0.105565
...,...
foretold,0.000000
foretells,0.000000
foretelling,0.000000
foreteller,0.000000


In [17]:
# Create a data frame to view all of the TF-IDF scores
tf_idf_all_docs = pd.DataFrame(tf_idfs.T.todense(), index=cv.get_feature_names_out())
tf_idf_all_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102272,102273,102274,102275,102276,102277,102278,102279,102280,102281
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzeed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzzero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzznoahh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzzz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Nicer if we re-orientate the scores so they're displayed in the same ways as the term frequencies at the top
# Use np.transpose to swap array rows and columns
tf_idf_all_docs_nicer = pd.DataFrame(np.transpose(tf_idfs.T.toarray()), index=df["song_by_artist"], columns=cv.get_feature_names_out())
tf_idf_all_docs_nicer

Unnamed: 0_level_0,00,000,00000,0000000,0017,007,00765,007style,008,01,...,zzq,zzt,zztt,zzu,zzz,zzzeed,zzzero,zzznoahh,zzzz,zzzzs
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everyday by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Live Till We Die by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Otherside by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pinot by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Shadows & Diamonds by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ganja Babe by Michael Franti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sandrevan Lullaby - Lifestyles by Rodriguez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hard Rain Don't Last by Darryl Worley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rat in Mi Kitchen by UB40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Even better, let's just display the TF-IDFs for certain words of interest
tf_idf_all_docs_nicer[["baby", "love"]]

Unnamed: 0_level_0,baby,love
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1
Everyday by Elijah Blake,0.000000,0.000000
Live Till We Die by Elijah Blake,0.000000,0.000000
The Otherside by Elijah Blake,0.000000,0.008664
Pinot by Elijah Blake,0.090785,0.027585
Shadows & Diamonds by Elijah Blake,0.000000,0.200372
...,...,...
Ganja Babe by Michael Franti,0.015884,0.067569
Sandrevan Lullaby - Lifestyles by Rodriguez,0.000000,0.000000
Hard Rain Don't Last by Darryl Worley,0.000000,0.059874
Rat in Mi Kitchen by UB40,0.000000,0.000000


## Now prepare a search query

In [20]:
# Now let's perform a simple query that looks for the following words
query = "love baby right night"

# Calculate term frequencies for the query using terms found across all of the documents
query_term_matrix = cv.transform([query])

In [21]:
# Across all of the terms, view the word counts for the query
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names_out())

# Query term counts, showing all terms within the documents
# query_counts

# Query term counts, showing just the query terms (shows what we know already of course)
query_counts[query.split(" ")]

Unnamed: 0,love,baby,right,night
0,1,1,1,1


## Calculate the cosine similarity between the TF-IDFs and the query words 

In [22]:
# Calculate the cosine similarity between the vector of each document and the query vector
results = cosine_similarity(tf_idfs, query_term_matrix)
results

array([[0.        ],
       [0.01745423],
       [0.00433201],
       ...,
       [0.02993693],
       [0.        ],
       [0.        ]])

In [23]:
results = results.reshape((-1,))
results

array([0.        , 0.01745423, 0.00433201, ..., 0.02993693, 0.        ,
       0.        ])

## Show the results

In [25]:
# Print the top 10 search results - voila, hopefully!
# Dan's note to self:
# argsort sorts an array in asc order, and then returns the indexes of the sorted values
# Useful slice notation reference: https://stackoverflow.com/questions/509211/understanding-slice-notation 
# [:-11:-1] returns the last 10 items, in reverse order
print("Search results for: '{}'".format(query))
for i in results.argsort()[:-6:-1]:
    if results[i] > 0:
        print("{} at index {} with {}% match".format(df.loc[i].song_by_artist, df.iloc[i,0], round(100*results[i])))

Search results for: 'love baby right night'
Turn It On by Lindsey Buckingham at index 141961 with 65% match
Turn It Up by Whiskey Myers at index 28390 with 63% match
Lost by Elegy at index 99410 with 60% match
Straight into a Storm by Deer Tick at index 80596 with 60% match
Simple Enough by Never Shout Never at index 123400 with 60% match
