# Using TF-IDF & cosine similarity to build a lyrically similar song search engine

Based on this article: https://alliescomputing.com/knowledge-base/christmas-carol-search-using-tf-idf-and-cosine-similarity

In [1]:
import numpy as np 
import pandas as pd

#for top-5-similar songs recommender
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity


#for text preprocessing:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load the dataset

In [2]:
# Load the preprocessed lyrics dataset and drop every row with a missing value.
# The index is reset afterwards so that row labels equal row positions: the term
# matrices built from df['clean_lyrics'] are addressed by position, and the
# results cell looks rows up with both df.loc[i] (label) and df.iloc[i, 0]
# (position). Without the reset, any dropped row makes those two disagree.
df = pd.read_csv('../data/preprocessed_dataset.csv').dropna().reset_index(drop=True)

# Start of Recommender Algorithm:
---

## Determine the term frequencies (TFs)

In [3]:
# Use a CountVectorizer to learn the terms and term frequencies across all of the documents (song lyrics)
cv = CountVectorizer()  # unfitted vectorizer with default settings; fitted in the next cell


In [4]:
# Learn the vocabulary from the cleaned lyrics and count each term per document.
# Takes roughly 5 seconds (author's timing).

doc_term_matrix = cv.fit_transform(df['clean_lyrics'])  # sparse document-term count matrix (csr_matrix per the original note)

## Determine the inverse document frequencies (IDFs)

In [5]:
# The term frequencies are in doc_term_matrix; create the transformer that will
# learn the inverse document frequencies (IDFs) from them.
idfs = TfidfTransformer() 


In [6]:
# Learn the per-term IDF weights from the document-term counts.
idfs.fit(doc_term_matrix)


TfidfTransformer()

In [7]:
# Sanity check: show the type of the fitted transformer.
type(idfs)

sklearn.feature_extraction.text.TfidfTransformer

In [8]:
# Create a data frame with the IDF value of every vocabulary term.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); use whichever method this
# installation provides so the cell runs on both old and new versions.
vocabulary_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
idfs_df = pd.DataFrame(idfs.idf_, index=vocabulary_terms, columns=["idfs"])

## Put it all together to calculate the TF-IDFs

In [9]:
# We have the term frequencies and inverse document frequencies - now calculate
# the TF-IDF scores for every document.
# The cells that follow check the type of tf_idfs (a SciPy sparse matrix) and
# save it to "tf_idfs_top5.npz" with scipy.sparse.save_npz.
tf_idfs = idfs.transform(doc_term_matrix)

In [10]:
# Check the matrix type to decide which format it can be saved in.
type(tf_idfs)

scipy.sparse.csr.csr_matrix

In [11]:
# Save the TF-IDF matrix to disk in SciPy's .npz sparse format so it can be
# loaded again later instead of being recomputed.

from scipy import sparse

sparse.save_npz("tf_idfs_top5.npz", tf_idfs)
# To load it back:
# tf_idfs_top5 = sparse.load_npz("tf_idfs_top5.npz")

## Now prepare a search query from user input

In [12]:
# user input must be preprocessed before feeding into cv.transform([query])

# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    """Return a copy of ``test_string`` with all cased characters lowercased."""
    lowered = test_string.lower()
    return lowered

# 2. function that removes all punctuation. 
def remove_punc(test_string):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores (the regex ``\\w`` class) and all whitespace
    are kept; everything else is removed without inserting a replacement.
    """
    return re.sub(r'[^\w\s]', '', test_string)

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    """Return ``test_string`` with all stopword tokens removed.

    The text is split with nltk's ``word_tokenize``; tokens found in the
    module-level ``stopwords`` list are discarded and the remaining tokens are
    joined back together with single spaces.
    """
    kept_tokens = [
        token
        for token in word_tokenize(test_string)
        if token not in stopwords
    ]
    return ' '.join(kept_tokens)

# 4. function to break words into their stem words
def stem_words(a_string):
    """Return ``a_string`` with every token replaced by its Porter stem.

    The text is split with nltk's ``word_tokenize``, each token is stemmed with
    a fresh ``PorterStemmer``, and the stems are joined with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(a_string))

In [13]:
# Pipeline function 

def text_processing_pipeline(a_string):
    """Clean raw text: lowercase it, strip punctuation, then drop stopwords.

    ``stem_words`` is intentionally not part of the chain; it was removed for
    now because stemming made the lyrics gibberish.
    """
    cleaning_steps = (make_lowercase, remove_punc, remove_stopwords)
    for step in cleaning_steps:
        a_string = step(a_string)
    return a_string

In [14]:
# Read the lyrics to search for from the user (execution waits here until a line is entered).

query = input("Enter your lyrics: ") 
# Example input: I love you baby and if its quite alright i need you baby


Enter your lyrics: # I love you baby and if its quite alright i need you baby


In [15]:
# Vectorize the user's query with the vocabulary learned from the lyrics.
# NOTE(review): both cleaning calls below are commented out, so the query is
# vectorized exactly as typed, although the preprocessing cell states that user
# input must be cleaned before cv.transform - confirm this is intended.

#query = text_processing_pipeline(query) #use this function call for cleaning data in THIS notebook 

# query = cleaning_data.clean_data(query) #use this function call for streamlit app 

# Calculate term frequencies for the query using terms found across all of the documents
query_term_matrix = cv.transform([query]) #using user input 

In [16]:
# Across all of the terms, view the word counts for the query.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); use whichever method this
# installation provides so the cell runs on both old and new versions.
vocabulary_terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=vocabulary_terms)

## Calculate the cosine similarity between the TF-IDFs and the query words 

In [17]:
# Calculate the cosine similarity between the vector of each document and the query vector.
# tf_idfs holds the TF-IDF-weighted document vectors; query_term_matrix holds the
# query's term counts from cv.transform (it is not passed through idfs.transform).
# The result has one row per document and a single column for the one query.
results = cosine_similarity(tf_idfs, query_term_matrix)
results

array([[0.       ],
       [0.       ],
       [0.0030632],
       ...,
       [0.0211686],
       [0.       ],
       [0.       ]])

In [18]:
# Flatten the single-column score matrix into a 1-D array with one score per document
results = results.ravel()
results

array([0.       , 0.       , 0.0030632, ..., 0.0211686, 0.       ,
       0.       ])

## Show the results

In [19]:
# Rank the documents by similarity to the query and report the best matches.
TOP_N = 5  # number of most similar songs to list

# Flatten defensively so the ranking works whether results is still the
# (n_documents, 1) array returned by cosine_similarity or has already been
# reshaped to 1-D.
scores = np.asarray(results).ravel()

print("Search results for input: \n ")
print("{}".format(query))
print("\nTop {} most similar songs based on lyrics are: \n".format(TOP_N))

# argsort sorts ascending and returns positions, so reverse it and keep the
# first TOP_N entries to get the highest scores in descending order.
for i in scores.argsort()[::-1][:TOP_N]:
    # Skip documents that share no terms with the query.
    if scores[i] > 0:
        # i is a row *position* in the score array, so look the row up with
        # .iloc for both fields. A label lookup (df.loc[i]) picks a different
        # row, or raises KeyError, once dropna() has removed rows and the
        # index labels no longer match positions.
        print("- {} at index {} with {}% match".format(
            df["song_by_artist"].iloc[i],
            df.iloc[i, 0],
            round(100 * scores[i]),
        ))

Search results for input: 
 
# I love you baby and if its quite alright i need you baby

Top 5 most similar songs based on lyrics are: 

- Straight into a Storm by Deer Tick at index 80596 with 73% match
- A  for Andrew by Attack Attack! at index 139998 with 71% match
- Too Nice to Talk To by The English Beat at index 136880 with 67% match
- Turn It On by Lindsey Buckingham at index 141961 with 65% match
- Lesson Number One by Marshall Crenshaw at index 55783 with 65% match
