# Using TF-IDF & cosine similarity to build a lyrically similar song search engine

based off this article https://alliescomputing.com/knowledge-base/christmas-carol-search-using-tf-idf-and-cosine-similarity 

In [1]:
import numpy as np 
import pandas as pd

#for top-5-similar songs recommender
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


#for text preprocessing:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load the dataset

In [2]:
# using the preprocessed lyrics dataset 
df = pd.read_csv('../data/preprocessed_dataset.csv')

In [3]:
print("Number of nulls: ", df.isnull().sum().sum())
print("Number of duplicates: ", df.duplicated().sum())
print("df shape: ", df.shape)
print("\n")
print("df value counts: \n")
print(df.genre.value_counts())
df.head()

Number of nulls:  3
Number of duplicates:  0
df shape:  (102285, 7)


df value counts: 

Pop        57357
Rock       26756
Country     7440
Rap         5959
R&B         4773
Name: genre, dtype: int64


Unnamed: 0.1,Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
0,0,Everyday by Elijah Blake,Elijah Blake,Everyday,0.626,R&B,aint ever trapped bando oh lord dont get wrong...
1,1,Live Till We Die by Elijah Blake,Elijah Blake,Live Till We Die,0.63,Pop,drinks go smoke goes feel got let go cares get...
2,2,The Otherside by Elijah Blake,Elijah Blake,The Otherside,0.24,R&B,dont live planet earth found love venus thats ...
3,3,Pinot by Elijah Blake,Elijah Blake,Pinot,0.536,R&B,trippin grigio mobbin lights low trippin grigi...
4,4,Shadows & Diamonds by Elijah Blake,Elijah Blake,Shadows & Diamonds,0.371,R&B,see midnight panther gallant brave found found...


In [4]:
missing_values = df.isnull().values.any()
if(missing_values):
    display(df[df.isnull().any(axis=1)])

Unnamed: 0.1,Unnamed: 0,song_by_artist,artist,song,label,genre,clean_lyrics
7576,11977,No by Vivian Girls,Vivian Girls,No,0.348,Pop,
48580,76269,U Can't Handle This by Mystikal,Mystikal,U Can't Handle This,0.45,Rap,
94058,134658,Being Alive [*] by Michael Crawford,Michael Crawford,Being Alive [*],0.261,Pop,


In [5]:
# Remove records with missing values
df.dropna(inplace=True)
print("df shape: ", df.shape)

df shape:  (102282, 7)


# Start of Recommender Algorithm:
---

## Determine the term frequencies (TFs)

In [6]:
#MODEL? takes time to run ~ 4 seconds
# Use a CountVectorizer to learn the terms and term frequencies across all of the documents (carols) 
#cv = CountVectorizer(stop_words='english')
cv = CountVectorizer() #type is CountVectorizer
doc_term_matrix = cv.fit_transform(df['clean_lyrics']) #type is csr_matrix

In [None]:
cv

## Perform some simple analysis

In [None]:
# Number of documents vs number of terms 
#doc_term_matrix.shape

In [None]:
# Get the terms - unique words excluding single char words like "a"
cv.get_feature_names()

In [None]:
# Check the number of terms
#len(cv.get_feature_names())

In [7]:
#time ~ 4 seconds

# View the word counts across all of the documents
word_counts = pd.DataFrame(doc_term_matrix.toarray(), index=df["song_by_artist"], columns=cv.get_feature_names())
word_counts

Unnamed: 0_level_0,00,000,00000,0000000,0017,007,00765,007style,008,01,...,zzq,zzt,zztt,zzu,zzz,zzzeed,zzzero,zzznoahh,zzzz,zzzzs
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everyday by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Live Till We Die by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Otherside by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pinot by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Shadows & Diamonds by Elijah Blake,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ganja Babe by Michael Franti,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sandrevan Lullaby - Lifestyles by Rodriguez,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hard Rain Don't Last by Darryl Worley,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rat in Mi Kitchen by UB40,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#type(word_counts) # DataFrame - takes time to buile. Save?

In [None]:
# View the most and least frequent words
#word_counts.sum().sort_values(ascending=False)

## Determine the inverse document frequencies (IDFs)

In [None]:
import timeit

In [8]:
# We have the term frequencies, now determine the inverse document frequencies (IDFs)
idfs = TfidfTransformer() 


In [9]:
idfs.fit(doc_term_matrix)


TfidfTransformer()

In [None]:
type(idfs) #type TfidfTransformer

In [10]:
# Create a data frame with the IDF values 
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names(), columns=["idfs"]) 
 
# Sort ascending and display
# High IDF (1/DF) terms are less frequent across all documents; low IDF terms are more frequent 
idfs_df.sort_values(by=['idfs'], ascending=False)

Unnamed: 0,idfs
zzzzs,11.842352
janay,11.842352
jarheads,11.842352
jargoned,11.842352
steine,11.842352
...,...
love,1.979295
like,1.890908
dont,1.852549
know,1.800168


## Put it all together to calculate the TF-IDFs

In [11]:
# We have the term frequencies and inverse document frequencies - now calculate the TF-IDF scores
tf_idfs = idfs.transform(doc_term_matrix)

## Do some more analysis

In [None]:
# necessary?
# Create a data frame to view the TF-IDF scores for the first document, tf_idfs[0]
tf_idf_doc0 = pd.DataFrame(tf_idfs[0].T.todense(), index=cv.get_feature_names(), columns=["tf-idf"])
tf_idf_doc0.sort_values(by=["tf-idf"], ascending=False)

In [12]:
#time long. save? ~ 6 seconds
# Create a data frame to view all of the TF-IDF scores
tf_idf_all_docs = pd.DataFrame(tf_idfs.T.todense(), index=cv.get_feature_names())
tf_idf_all_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102272,102273,102274,102275,102276,102277,102278,102279,102280,102281
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzeed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzzero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzznoahh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzzz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# time ~ 6 seconds

# Nicer if we re-orientate the scores so they're displayed in the same ways as the term frequencies at the top
# Use np.transpose to swap array rows and columns
tf_idf_all_docs_nicer = pd.DataFrame(np.transpose(tf_idfs.T.toarray()), index=df["song_by_artist"], columns=cv.get_feature_names())
tf_idf_all_docs_nicer

Unnamed: 0_level_0,00,000,00000,0000000,0017,007,00765,007style,008,01,...,zzq,zzt,zztt,zzu,zzz,zzzeed,zzzero,zzznoahh,zzzz,zzzzs
song_by_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everyday by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Live Till We Die by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Otherside by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pinot by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Shadows & Diamonds by Elijah Blake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ganja Babe by Michael Franti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sandrevan Lullaby - Lifestyles by Rodriguez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hard Rain Don't Last by Darryl Worley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rat in Mi Kitchen by UB40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# tf_idf_all_docs_nicer.to_csv()

## Now prepare a search query from user input

In [None]:
text_processing_pipeline

In [None]:
# # user input must be preprocessed before feeding into cv.transform([query])

# # 1. function that makes all text lowercase.
# def make_lowercase(test_string):
#     return test_string.lower()

# # 2. function that removes all punctuation. 
# def remove_punc(test_string):
#     test_string = re.sub(r'[^\w\s]', '', test_string)
#     return test_string

# # 3. function that removes all stopwords.
# def remove_stopwords(test_string):
#     # Break the sentence down into a list of words
#     words = word_tokenize(test_string)
    
#     # Make a list to append valid words into
#     valid_words = []
    
#     # Loop through all the words
#     for word in words:
        
#         # Check if word is not in stopwords. Stopwords was imported from nltk.corpus
#         if word not in stopwords:
            
#             # If word not in stopwords, append to our valid_words
#             valid_words.append(word)

#     # Join the list of words together into a string
#     a_string = ' '.join(valid_words)

#     return a_string

# # 4. function to break words into their stem words
# def stem_words(a_string):
#     # Initalize our Stemmer
#     porter = PorterStemmer()
    
#     # Break the sentence down into a list of words
#     words = word_tokenize(a_string)
    
#     # Make a list to append valid words into
#     valid_words = []

#     # Loop through all the words
#     for word in words:
#         # Stem the word
#         stemmed_word = porter.stem(word) #from nltk.stem import PorterStemmer
        
#         # Append stemmed word to our valid_words
#         valid_words.append(stemmed_word)
        
#     # Join the list of words together into a string
#     a_string = ' '.join(valid_words)

#     return a_string 

In [None]:
# # Pipeline function 

# def text_processing_pipeline(a_string):
#     a_string = make_lowercase(a_string)
#     a_string = remove_punc(a_string)
#     #a_string = stem_words(a_string) #removing stem_words for now because making lyrics gibberish
#     a_string = remove_stopwords(a_string)
#     return a_string

In [14]:
#get user input for lyrics

query = input("Enter your lyrics: ") 
# I am so happy to be in your arms once again I love you so much light of my life
# "Nice to meet you, where you been? I could show you incredible things Magic, madness, heaven, sin Saw you there and I thought Oh, my God, look at that face You look like my next mistake Love's a game, wanna play? Ay New money, suit and tie I can read you like a magazine Ain't it funny? Rumors fly And I know you heard about me So hey, let's be friends I'm dying to see how this one ends Grab your passport and my hand I can make the bad guys good for a weekend So it's gonna be forever Or it's gonna go down in flames You can tell me when it's over, mm If the high was worth the pain Got a long list of ex-lovers They'll tell you I'm insane 'Cause you know I love the players And you love the game"


Enter your lyrics: hello my friend its me


In [16]:
#using user input 

# query = text_processing_pipeline(query) #use this function call for cleaning data in THIS notebook 

query = cleaning_data.clean_data(query) #use this function call for streamlit app 

# Calculate term frequencies for the query using terms found across all of the documents
query_term_matrix = cv.transform([query]) #using user input 

NameError: name 'dataset_preprocessing' is not defined

In [None]:
# Across all of the terms, view the word counts for the query
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names())

# Query term counts, showing all terms within the documents
# query_counts

# Query term counts, showing just the query terms (shows what we know already of course)
#query_counts[query.split(" ")]

## Calculate the cosine similarity between the TF-IDFs and the query words 

In [None]:
# Calculate the cosine similarity between the vector of each document and the query vector
results = cosine_similarity(tf_idfs, query_term_matrix)
results

In [None]:
results = results.reshape((-1,))
results

## Show the results

In [None]:
# argsort sorts an array in asc order, and then returns the indexes of the sorted values
# Useful slice notation reference: https://stackoverflow.com/questions/509211/understanding-slice-notation 
# [:-6:-1] returns the last 5 items, in reverse order
print("Search results for input: \n ")
print("{}".format(query))
print("\nTop 5 most similar songs based on lyrics are: \n")

for i in results.argsort()[:-6:-1]:
    if results[i] > 0:
        print("- {} at index {} with {}% match".format(df.loc[i].song_by_artist, df.iloc[i,0], round(100*results[i])))