# Using TF-IDF & cosine similarity to build a lyrically similar song search engine

Based on this article: https://alliescomputing.com/knowledge-base/christmas-carol-search-using-tf-idf-and-cosine-similarity

In [2]:
import numpy as np 
import pandas as pd

#for top-5-similar songs recommender
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

#for text preprocessing:
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch the NLTK data packages this notebook relies on
# (the captured log below shows all three were already up-to-date).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# NOTE: this rebinds the name `stopwords` from the imported nltk.corpus object to
# the English stopword collection; remove_stopwords() below tests membership in it.
stopwords = stopwords.words('english')

import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aleksandrageorgievska/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load the dataset

In [3]:
# Read the already-preprocessed lyrics table, then discard every row that has a missing value.
# dropna() keeps the surviving rows' original index labels.
df = pd.read_csv('../data/preprocessed_dataset.csv')
df = df.dropna()

# Start of Recommender Algorithm:
---

## Determine the term frequencies (TFs)

In [4]:
# A CountVectorizer is used to learn the terms and term frequencies across all of the documents (song lyrics).
# It is only constructed here; the vocabulary is learned when fit_transform is called in the next cell.
cv = CountVectorizer() #type is CountVectorizer


In [None]:
# Fit the vectorizer on the cleaned lyrics and build the document-term count matrix (~ 7 seconds).
# This cell must run before cv.transform() is called on the query further down.

doc_term_matrix = cv.fit_transform(df['clean_lyrics']) #type is csr_matrix

In [None]:
    # print(type(doc_term_matrix))
    # print(doc_term_matrix.shape)

In [None]:
    # print(doc_term_matrix)

In [None]:
    #sparse.save_npz("doc_term_matrix.npz", doc_term_matrix)


In [None]:
    #doc_term_matrix = sparse.load_npz("doc_term_matrix.npz")

In [None]:
    # print(type(doc_term_matrix))
    # print(doc_term_matrix.shape)

In [None]:
    # print(type(doc_term_matrix))

In [None]:
    # print(doc_term_matrix)

## Determine the inverse document frequencies (IDFs)

In [None]:
# We have the term frequencies, now determine the inverse document frequencies (IDFs).
# The transformer is only constructed here; it is fitted on doc_term_matrix in the next cell.
idfs = TfidfTransformer() 


In [None]:
# Required step: fit the transformer on the term counts so that idfs.idf_ and
# idfs.transform() (both used in the cells below) are available.
idfs.fit(doc_term_matrix)


In [None]:
# Create a data frame with one row per vocabulary term holding its learned IDF value.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names_out(), columns=["idfs"])

In [None]:
type(idfs)

In [None]:
# Show the learned vocabulary. get_feature_names() was removed in scikit-learn 1.2,
# so use the supported get_feature_names_out() accessor.
cv.get_feature_names_out()

## Put it all together to calculate the TF-IDFs

In [None]:
# We have the term frequencies and inverse document frequencies - now calculate the TF-IDF scores.
# The resulting matrix is written to disk further down with sparse.save_npz and
# read back with sparse.load_npz.
tf_idfs = idfs.transform(doc_term_matrix)

In [None]:
type(tf_idfs)

In [None]:
print(tf_idfs.shape)

In [None]:
print(tf_idfs)

In [None]:
# Save the TF-IDF matrix to "tf_idfs_top5.npz"; the next cell loads it back from that file.



sparse.save_npz("tf_idfs_top5.npz", tf_idfs)


In [5]:
# Load the TF-IDF matrix saved above; this is the matrix the query is compared against later.
tf_idfs_top5 = sparse.load_npz("tf_idfs_top5.npz")

In [None]:
print(tf_idfs_top5.shape)

In [None]:
print(tf_idfs_top5)

## Now prepare a search query from user input

In [6]:
# user input must be preprocessed before feeding into cv.transform([query])

# 1. function that makes all text lowercase.
def make_lowercase(test_string):
    """Return a lowercased copy of *test_string*."""
    lowered = test_string.lower()
    return lowered

# 2. function that removes all punctuation. 
def remove_punc(test_string):
    """Return *test_string* with every character that is neither a word
    character nor whitespace removed.

    Underscores count as word characters for the regex and are therefore kept.
    """
    return re.sub(r'[^\w\s]', '', test_string)

# 3. function that removes all stopwords.
def remove_stopwords(test_string):
    """Return *test_string* with all stopword tokens dropped.

    The text is split with ``word_tokenize``; every token that is not found in
    the module-level ``stopwords`` collection is kept, in its original order,
    and the survivors are re-joined with single spaces.
    """
    kept_tokens = [
        token for token in word_tokenize(test_string) if token not in stopwords
    ]
    return ' '.join(kept_tokens)

# 4. function to break words into their stem words
def stem_words(a_string):
    """Return *a_string* with each token replaced by its Porter stem.

    The text is split with ``word_tokenize``, each token is stemmed with a
    ``PorterStemmer``, and the stems are re-joined with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(a_string))

In [7]:
# Pipeline function 

def text_processing_pipeline(a_string):
    """Clean *a_string* for vectorizing and return the cleaned text.

    Steps, applied in order: lowercase, strip punctuation, remove stopwords.
    """
    # stem_words is deliberately left out for now: stemming turned the lyrics
    # into gibberish.
    for step in (make_lowercase, remove_punc, remove_stopwords):
        a_string = step(a_string)
    return a_string

In [8]:
# Example user input: a snippet of lyrics to search with.
# ("lonely" was previously misspelled "lonley", so that word could never match
# a term in the lyric vocabulary.)

query = "I love you baby and if it's quite alright I need you baby to warm the lonely night"
# I love you baby and if its quite alright i need you baby


In [9]:
# Clean the raw query (lowercase, strip punctuation, drop stopwords) before vectorizing it
query = text_processing_pipeline(query) #use this function call for cleaning data in THIS notebook 

    #query = cleaning_data.clean_data(query) #use this function call for streamlit app 



In [10]:
# Calculate term frequencies for the query using terms found across all of the documents.
# cv must already be fitted (the cv.fit_transform cell above); if that cell has not been
# run, this call raises the NotFittedError shown below.
query_term_matrix = cv.transform([query]) #using user input 

NotFittedError: Vocabulary not fitted or provided

In [None]:
            # Across all of the terms, view the word counts for the query
#             query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names())

## Calculate the cosine similarity between the TF-IDFs and the query words 

In [None]:
# Calculate the cosine similarity between the vector of each document and the query vector
# NOTE(review): tf_idfs_top5 holds TF-IDF weights while query_term_matrix holds raw term
# counts from cv.transform; consider passing the query through idfs.transform() so both
# sides are weighted the same way - confirm before changing.
results = cosine_similarity(tf_idfs_top5, query_term_matrix)
results

In [None]:
# Flatten the similarity scores to 1-D so they can be ranked with argsort and
# looked up with a single position in the cell below
results = results.reshape((-1,))
results

## Show the results

In [None]:
# argsort sorts an array in asc order, and then returns the positions of the sorted values
# Useful slice notation reference: https://stackoverflow.com/questions/509211/understanding-slice-notation 
# [:-6:-1] returns the last 5 items, in reverse order (i.e. the 5 highest scores, best first)
print("Search results for input: \n ")
print("{}".format(query))
print("\nTop 5 most similar songs based on lyrics are: \n")

for i in results.argsort()[:-6:-1]:
    # Skip songs that share no terms with the query
    if results[i] > 0:
        # `i` is a row *position* in the TF-IDF matrix, so the song has to be looked up
        # by position (iloc). Label-based df.loc[i] returns the wrong row or raises
        # KeyError once dropna() has left gaps in the index labels.
        print("- {} at index {} with {}% match".format(df.iloc[i].song_by_artist, df.iloc[i, 0], round(100*results[i])))