In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, Counter
from tqdm import tqdm
from langdetect import detect, DetectorFactory
import multiprocessing
import os
import pickle

In [3]:
# steps for Rocchio Feedback Filter
# PROCESS 1 convert the raw lyrics into the concept space.
# 1. Create a TFIDF vectorizer
# 2. Create a document term matrix using TFIDF vec fit_transform using the raw lyrics. [I think there is an option to lemmatize here]
# 3. Complete latent semantic indexing using TruncatedSVD(num components = num comncepts, specifiy the random state)
# 4. Fit the document term matrix using TruncatedSVD.fit_transform. THESE ARE YOUR VECTORS FOR SIMILARITY SCORING

# PROCESS 2 convert the query into a vector
# 1. Convert querry into a raw string
# 2. Use the TFIDF vectorizer above to transform the querry
# 3. Use the LSI object above to convert the querry into the concept space.

# PROCESS 3 execute the search
# 1. Find the cosine similarity between the querry and all lyrics
# 2. Sort the tracks by similarity
# 3. Return the top N tracks to the user.

# PROCESS 4 Rochhio Feedback Filtering
# 1. Group user feeback by love, no answer, dislike
# 2. Calculate the mean for each group
# 3. Apply alpha, beta, gamma, and phi to:
#       Original search, loves, hates, nuetral
# 4. Update the lyric search querry vector and return new results!

In [63]:
# convert the notebook into functions.
def load_lyrics_data():
    """
    return a dataframe with cleaned lyrics
    """
    temp_path = os.getcwd()
    root_path = temp_path.split('/napster_2')[0]
    repo_path = '/napster_2/search_functionality/final_merged_data.csv'
    data_path = root_path + repo_path
    # step 1 import old lyrical data into a dataframe.
    lyric_df = pd.read_csv(data_path)
    lyric_df = lyric_df[['track_name', 'artist_name', 'track_id','raw_lyrics']]
    # drop duplicate instances of the same track.
    lyric_df = lyric_df.drop_duplicates(subset='raw_lyrics').reset_index(drop=True)
    lyric_df = lyric_df.dropna()
    # replace new line character
    lyric_df['raw_lyrics'].replace('\n', ' ',regex=True, inplace=True)
    # remove embed text from lyric genius API
    lyric_df['raw_lyrics'].replace('[0-9]{1,3}Embed', '', regex=True, inplace=True)
    return lyric_df

def create_lyric_tfidf(lyric_df, min_df):
    """ 
    Create a tfidf vectorizer for the track lyrics
    """
    tfidf = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
    # fit the TFIDF vectorizer
    tfidf.fit(lyric_df['raw_lyrics'])
    return tfidf

def lsi_lyrics(lyric_df, tfidf, num_concepts):
    """
    fit an LSI object using the lyrics
    """
    lyricTermMat = tfidf.transform(lyric_df['raw_lyrics'])
    lsiObj = TruncatedSVD(n_components=num_concepts, random_state=15)
    lsiObj.fit(lyricTermMat)
    return lsiObj

def create_lyric_vecs(lyric_df, lsiObj, tfidf):
    """ 
    generate the lyric vectors in the concept space
    """
    lyricTermMat = tfidf.transform(lyric_df['raw_lyrics'])
    lyric_vecs = lsiObj.transform(lyricTermMat)
    return lyric_vecs

def create_lyric_dictionary(lyric_df, lyric_vecs):
    """ 
    input a lyric dataframe and lsi lyric vectors
    return a dictionary where the track id is the key
    the vector is the value
    """
    # create a dataframe where the track id is the index the docVecs are the rows.
    track_vec_dict = defaultdict(list)
    track_ids = lyric_df['track_id'].values
    track_vec_dict = {track_ids[i]: lyric_vecs[i] for i in range(len(track_ids))}
    return track_vec_dict

def lsi_on_query(lsiObj, user_query, tfidf):
    """ 
    Transform the user search string into the concept space
    """
    userVec = tfidf.transform([user_query])
    # convert query vec into the concept space
    userLsi = lsiObj.transform(userVec)
    return userLsi

def retrieve_10_tracks(lyric_df, userLsi, lyric_vecs):
    """ 
    calculate cosine similarity between lyrics and user query
    return the top 10 in a dataframe.
    """
    # calculate cosine similarity between every track and the lyric provided.
    simVals = cosine_similarity(lyric_vecs, userLsi)
    # create a track name, track id, artist name, similarity dataframe
    # need to make this a copy so that we do not modify the already stored data
    lyric_df['similarity'] = simVals
    user_playlist = lyric_df.sort_values(by='similarity', ascending=False)
    user_playlist = user_playlist.head(10)[['track_name', 'artist_name', 'track_id']]
    # initialize a feedback column and set every row to 0.
    user_playlist['feedback'] = 0
    return user_playlist

def simulate_user_input(user_playlist):
    """ 
    Simulate user input assign 0,1,2 to user feedback
    0 = nuetral
    1 = dislike
    2 = love
    """
    feedback = [np.random.randint(0,3) for i in range(len(user_playlist))]
    user_playlist['feedback'] = feedback
    return user_playlist

def rocchio_feedback(alpha, beta, gamma, phi, user_playlist, track_vec_dict, userLsi):
    """ 
    Rochhio Feedback Filtering
    1. Group user feeback by love, nuetral, dislike
    2. Calculate the mean for each group
    3. Apply alpha, beta, gamma, and phi to:
           Original search, loves, hates, nuetral
    4. Update the lyric search querry vector and return new results!
    return an updated query vector to improve search results
    """
    # create a mean vector dict for all 3 states
    meanVectDict = defaultdict(list)
    # iterate through the three states nuetral[0], dislike[1], love[2]
    for i in range(3):
        temp_tracks = user_playlist[user_playlist['feedback']==i]
        if len(temp_tracks) > 0:
            # this means that tracks with this sentiment exist.
            # we can go get the track vectors from the track_vec_dict
            tempVecs = [track_vec_dict[vec] for vec in temp_tracks['track_id']]
            # next we need to calculate the mean vector for this segment
            meanVec = np.mean(tempVecs, axis=0)
            # add this mean to the mean vect dict. The key is the state.
            meanVectDict[i] = meanVec
        else:
            # if there are no tracks in this state, set its mean to 0
            meanVectDict[i] = 0
    # calcualte the new query vector by summing all of the mean vectors together
    newQueryVec = alpha*userLsi + beta * meanVectDict[2] - gamma * meanVectDict[1] + phi * meanVectDict[0]
    return newQueryVec

In [31]:
# All functions from scratch.
user_query =  'Jealousy, turning saints into the sea Swimming through sick lullabies Choking on your alibis'
# load in lyrics dataframe
lyric_df = load_lyrics_data()
# create a tfidf vectorizer object
tfidf = create_lyric_tfidf(lyric_df, 10)
# create a latent semantic indexing object
lsiObj = lsi_lyrics(lyric_df, tfidf, 100)
# convert the lyrics into content vectors
lyric_vecs = create_lyric_vecs(lyric_df, lsiObj, tfidf)
# create a dictionary mapping track id to content vector
track_vec_dict = create_lyric_dictionary(lyric_df, lyric_vecs)
# convert a user string query into the concept space
userLsi = lsi_on_query(lsiObj, user_query, tfidf)
# create a user playlist and return a dataframe
user_playlist = retrieve_10_tracks(lyric_df, userLsi, lyric_vecs)
# simulate user input while we are not connected to the GUI
user_playlist = simulate_user_input(user_playlist)
# Apply rocchio feedback filter to generate a better query
rocchioSearch = rocchio_feedback(1.0, 0.75, 0.25, 0.5, user_playlist, track_vec_dict, userLsi)
# generate a new user playlist with the updated search
user_playlist = retrieve_10_tracks(lyric_df, rocchioSearch, lyric_vecs)



In [15]:
def create_lsi_dict_pickle(track_vec_dict):
    pickle.dump(track_vec_dict, open('lsi_vec_dict.p', 'wb'))
def create_lsi_obj_pickle(lsiObj):
    pickle.dump(lsiObj, open('lsi_obj.p', 'wb'))
def create_tfidf_obj_pickle(tfidf):
    pickle.dump(tfidf, open('tfidf_obj.p', 'wb'))
def load_lsi_pickle():
    """ 
    read in the pickle file containing the fitted
    LSI object
    """
    with open('lsi_obj.p', 'rb') as lsi_file:
        lsiObj = pickle.load(lsi_file)
        lsi_file.close()
        return lsiObj
def load_tfidf_pickle():
    """ 
    read in the pickle file containing the fitted
    TFIDF object
    """
    with open('tfidf_obj.p', 'rb') as tfidf_file:
        tfidf = pickle.load(tfidf_file)
        tfidf_file.close()
        return tfidf
def load_lsi_dict_pickle():
    """ 
    read in the pickle file containing the fitted
    TFIDF object
    """
    with open('lsi_vec_dict.p', 'rb') as lyric_file:
        lryicVecs = pickle.load(lyric_file)
        lyric_file.close()
        return lryicVecs

In [66]:
# production work flow (bypass creating new objects by reading in their pickle instead)
user_query =  'Jealousy, turning saints into the sea Swimming through sick lullabies Choking on your alibis'
# load in lyrics dataframe
lyric_df = pd.read_csv('track_artist_id_df')
# create a tfidf vectorizer object
tfidf = load_tfidf_pickle()
# create a latent semantic indexing object
lsiObj = load_lsi_pickle()
# convert the lyrics into content vectors This is stored in the dictionary
lyric_vecs = np.array(list(track_vec_dict.values()))
# create a dictionary mapping track id to content vector
track_vec_dict = load_lsi_dict_pickle()
# convert a user string query into the concept space
userLsi = lsi_on_query(lsiObj, user_query, tfidf)
# create a user playlist and return a dataframe
user_playlist = retrieve_10_tracks(lyric_df, userLsi, lyric_vecs)
# simulate user input while we are not connected to the GUI
user_playlist = simulate_user_input(user_playlist)
# Apply rocchio feedback filter to generate a better query
rocchioSearch = rocchio_feedback(1.0, 0.75, 0.25, 0.5, user_playlist, track_vec_dict, userLsi)
# generate a new user playlist with the updated search
user_playlist = retrieve_10_tracks(lyric_df, rocchioSearch, lyric_vecs)

In [103]:
class Napster2_Rocchio_Feedback():
    """ 
    Object for containing all relevant Rocchio Search functionality
    """
    def __init__(self):
        self.query = None
        self.user_playlist = None
        self.main_path = self.base_path()
        self.lyric_vec_dict = self.load_lsi_dict_pickle()
        self.lyric_vecs = np.array(list(self.lyric_vec_dict.values()))
        self.lsiObj = self.load_lsi_pickle()
        self.tfidf = self.load_tfidf_pickle()
        self.all_tracks_df = pd.read_csv(self.main_path+'track_artist_id_df')
    
    def base_path(self):
            temp_path = os.getcwd()
            root_path = temp_path.split('/napster_2')[0]
            repo_path = '/napster_2/search_functionality/'
            return root_path + repo_path

    def userSearch(self, user_query):
        """ 
        Transform the user search string into the concept space
        """
        userVec = self.tfidf.transform([user_query])
        # convert query vec into the concept space
        userLsi = self.lsiObj.transform(userVec)
        self.query = userLsi

    def load_lsi_pickle(self):
        """ 
        read in the pickle file containing the fitted
        LSI object
        """
        temp_path = self.main_path + 'lsi_obj.p'
        with open(temp_path, 'rb') as lsi_file:
            lsiObj = pickle.load(lsi_file)
            lsi_file.close()
            return lsiObj
    def load_tfidf_pickle(self):
        """ 
        read in the pickle file containing the fitted
        TFIDF object
        """
        temp_path = self.main_path + 'tfidf_obj.p'
        with open(temp_path, 'rb') as tfidf_file:
            tfidf = pickle.load(tfidf_file)
            tfidf_file.close()
            return tfidf
    def load_lsi_dict_pickle(self):
        """ 
        read in the pickle file containing the fitted
        TFIDF object
        """
        temp_path = self.main_path +'lsi_vec_dict.p' 
        with open(temp_path, 'rb') as lyric_file:
            lryicVecs = pickle.load(lyric_file)
            lyric_file.close()
            return lryicVecs

    def create_user_playlist(self): #lyric_df, userLsi, lyric_vecs):
        """ 
        calculate cosine similarity between lyrics and user query
        return the top 10 in a dataframe.
        """
        # calculate cosine similarity between every track and the lyric provided.
        simVals = cosine_similarity(self.lyric_vecs, self.query)
        # create a track name, track id, artist name, similarity dataframe
        # need to make this a copy so that we do not modify the already stored data
        self.all_tracks_df['similarity'] = simVals
        self.user_playlist = lyric_df.sort_values(by='similarity', ascending=False).head(10)[['track_name', 'artist_name', 'track_id']]
        self.user_playlist['feedback'] = 0

    def return_top_10_tracks(self):
        """ 
        return the top 10 tracks based on cosine similarity
        """
        return self.user_playlist

    def apply_feedback(self, feedback_series):
        """ 
        User input assign 0,1,2 to user feedback
        0 = nuetral
        1 = dislike
        2 = love
        """
        self.user_playlist['feedback'] = feedback_series

    def rocchio_feedback(self, alpha=1.0, beta=0.75, gamma=0.25, phi=0.5):#, user_playlist, track_vec_dict, userLsi):
        """ 
        Rochhio Feedback Filtering
        1. Group user feeback by love, nuetral, dislike
        2. Calculate the mean for each group
        3. Apply alpha, beta, gamma, and phi to:
            Original search, loves, hates, nuetral
        4. Update the lyric search querry vector and return new results!
        return an updated query vector to improve search results
        """
        # create a mean vector dict for all 3 states
        meanVectDict = defaultdict(list)
        # iterate through the three states nuetral[0], dislike[1], love[2]
        for i in range(3):
            temp_tracks = self.user_playlist[self.user_playlist['feedback']==i]
            if len(temp_tracks) > 0:
                # this means that tracks with this sentiment exist.
                # we can go get the track vectors from the track_vec_dict
                tempVecs = [self.lyric_vec_dict[vec] for vec in temp_tracks['track_id']]
                # next we need to calculate the mean vector for this segment
                meanVec = np.mean(tempVecs, axis=0)
                # add this mean to the mean vect dict. The key is the state.
                meanVectDict[i] = meanVec
            else:
                # if there are no tracks in this state, set its mean to 0
                meanVectDict[i] = 0
        # calcualte the new query vector by summing all of the mean vectors together
        newQueryVec = alpha*self.query + beta * meanVectDict[2] - gamma * meanVectDict[1] + phi * meanVectDict[0]
        self.query = newQueryVec
        # update the search after making a new query vector
        self.create_user_playlist()
    

In [104]:
myRFF = Napster2_Rocchio_Feedback()

In [105]:
myRFF.userSearch('Jealousy, turning saints into the sea')

In [114]:
myRFF.create_user_playlist()
myRFF.apply_feedback([1,2,1,2,1,2,0,1,1,0])
myRFF.user_playlist
myRFF.rocchio_feedback()
pd.DataFrame(myRFF.return_top_10_tracks().iloc[[0]])


Unnamed: 0,track_name,artist_name,track_id,feedback
69605,On Your Own,Meltt,1jUp1Yu76G1mXzr224YKHb,0
