In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, Counter
from tqdm import tqdm
from langdetect import detect, DetectorFactory
import multiprocessing
import os
import pickle

In [68]:
# pip importer
!pip install langdetect

Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 784 kB/s eta 0:00:01
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993221 sha256=44c6145145bacc1f13efb41076d3cd325f994e85efad37a17fd325419f1fafa5
  Stored in directory: /Users/cardoni/Library/Caches/pip/wheels/13/c7/b0/79f66658626032e78fc1a83103690ef6797d551cb22e56e734
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
# Project root = the part of the working directory that precedes '/napster_2'.
temp_path = os.getcwd()
root_path = temp_path.partition('/napster_2')[0]

# Merged data set that lives under search_functionality.
repo_path = '/napster_2/search_functionality/final_merged_data.csv'
practice_data_path = f'{root_path}{repo_path}'

# practice_data.csv that lives under lyric_genius_api.
og_path = f"{root_path}{'/napster_2/lyric_genius_api/practice_data.csv'}"
og_df = pd.read_csv(og_path)




In [4]:
# Read the merged data set found at practice_data_path into a DataFrame.
test_df = pd.read_csv(practice_data_path)

In [5]:
# Keep only what the lyric search needs: artist name, track name, track id, raw lyrics.
# .copy() makes track_df an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
track_df = test_df[['artist_name', 'track_name', 'track_id', 'raw_lyrics']].copy()

In [8]:
# Remove tracks whose lyrics are not in English.
# Interesting side application: can we find the most similar track to a given track in a different language?
def remove_non_english_tracks(raw_df):
    """
    Keep only the tracks whose lyrics are detected as English.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain a 'raw_lyrics' column. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows whose detected language is 'en', with an
        extra 'language' column carrying the detected code.
    """
    def _detect_language(text):
        # Missing values and lyrics without a single letter give the detector
        # nothing to work with; label them None so the row is filtered out
        # instead of aborting the whole run.
        if not isinstance(text, str) or not any(ch.isalpha() for ch in text):
            return None
        return detect(text)

    # langdetect is randomised by default; fixing the seed makes repeated runs
    # of the notebook label the same track the same way.
    DetectorFactory.seed = 0

    languages = raw_df['raw_lyrics'].apply(_detect_language)
    # assign() builds a new frame, so the caller's dataframe is left untouched.
    labelled = raw_df.assign(language=languages)
    return labelled[labelled['language'] == 'en']

In [10]:
# Steps for the Rocchio feedback filter
# PROCESS 1: convert the raw lyrics into the concept space.
# 1. Create a TF-IDF vectorizer.
# 2. Create a document-term matrix by calling the TF-IDF vectorizer's fit_transform on the raw lyrics. [I think there is an option to lemmatize here]
# 3. Set up latent semantic indexing with TruncatedSVD(n_components = number of concepts; specify the random state).
# 4. Transform the document-term matrix with TruncatedSVD.fit_transform. THESE ARE YOUR VECTORS FOR SIMILARITY SCORING.

# PROCESS 2: convert the query into a vector.
# 1. Convert the query into a raw string.
# 2. Use the TF-IDF vectorizer above to transform the query.
# 3. Use the LSI object above to convert the query into the concept space.

# PROCESS 3: execute the search.
# 1. Find the cosine similarity between the query and all lyrics.
# 2. Sort the tracks by similarity.
# 3. Return the top N tracks to the user.

# PROCESS 4: Rocchio feedback filtering.
# 1. Group user feedback by love, no answer, dislike.
# 2. Calculate the mean vector for each group.
# 3. Apply alpha, beta, gamma, and phi to:
#       original search, loves, hates, neutral.
# 4. Update the lyric search query vector and return new results!



In [23]:
# STEP 5 convert the notebook into functions.
def load_lyrics_data(data_path=None):
    """
    Return a dataframe with cleaned lyrics.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to read. When omitted, the file
        '/napster_2/search_functionality/final_merged_data.csv' is resolved
        against the part of the current working directory that precedes
        '/napster_2'.

    Returns
    -------
    pandas.DataFrame
        Columns track_name, artist_name, track_id and raw_lyrics. Rows with
        missing values are removed, only the first row per distinct raw_lyrics
        is kept, and the index runs 0..n-1. In raw_lyrics, newline characters
        are replaced by a space and the '<1-3 digits>Embed' text left by the
        lyric genius API is removed.
    """
    if data_path is None:
        root_path = os.getcwd().split('/napster_2')[0]
        data_path = root_path + '/napster_2/search_functionality/final_merged_data.csv'

    # step 1 import old lyrical data into a dataframe.
    lyric_df = pd.read_csv(data_path)
    lyric_df = lyric_df[['track_name', 'artist_name', 'track_id', 'raw_lyrics']]

    # Drop incomplete rows first so an incomplete duplicate cannot shadow a
    # complete row with the same lyrics, then drop duplicate instances of the
    # same track. Resetting the index last keeps it contiguous.
    lyric_df = (
        lyric_df.dropna()
        .drop_duplicates(subset='raw_lyrics')
        .reset_index(drop=True)
    )

    # Build the cleaned column and assign it back explicitly: calling
    # replace(..., inplace=True) on lyric_df['raw_lyrics'] is chained assignment
    # and does not update lyric_df under pandas Copy-on-Write.
    cleaned_lyrics = (
        lyric_df['raw_lyrics']
        # replace new line character
        .replace('\n', ' ', regex=True)
        # remove embed text from lyric genius API
        .replace('[0-9]{1,3}Embed', '', regex=True)
    )
    return lyric_df.assign(raw_lyrics=cleaned_lyrics)

def create_lyric_tfidf(lyric_df, min_df):
    """
    Build a TF-IDF vectorizer and fit it on the track lyrics.

    The vectorizer tokenizes with str.split and receives min_df unchanged.
    It is fitted on lyric_df['raw_lyrics'] and returned.
    """
    lyrics = lyric_df['raw_lyrics']
    vectorizer = TfidfVectorizer(min_df=min_df, tokenizer=str.split)
    # learn the vocabulary and idf weights from the lyrics
    vectorizer.fit(lyrics)
    return vectorizer

def lsi_lyrics(lyric_df, tfidf, num_concepts):
    """
    Fit a TruncatedSVD (LSI) model on the lyrics.

    The lyrics in lyric_df['raw_lyrics'] are transformed with the given tfidf
    object, and a TruncatedSVD with num_concepts components (random_state 15)
    is fitted on the result. The fitted object is returned.
    """
    term_matrix = tfidf.transform(lyric_df['raw_lyrics'])
    svd_model = TruncatedSVD(random_state=15, n_components=num_concepts)
    svd_model.fit(term_matrix)
    return svd_model

def create_lyric_vecs(lyric_df, lsiObj, tfidf):
    """
    Project the lyrics into the concept space.

    lyric_df['raw_lyrics'] is passed through tfidf.transform and the result
    through lsiObj.transform; that final output is returned.
    """
    return lsiObj.transform(tfidf.transform(lyric_df['raw_lyrics']))

def create_lyric_dictionary(lyric_df, lyric_vecs):
    """
    Map every track id to its lyric vector.

    Parameters
    ----------
    lyric_df : pandas.DataFrame
        Must contain a 'track_id' column.
    lyric_vecs : sequence of vectors
        One vector per row of lyric_df, in the same row order.

    Returns
    -------
    dict
        track id -> vector. If a track id occurs more than once, the vector of
        its last occurrence is kept.

    Raises
    ------
    ValueError
        If the number of vectors differs from the number of track ids; pairing
        them up anyway would attach vectors to the wrong tracks.
    """
    track_ids = lyric_df['track_id'].values
    if len(track_ids) != len(lyric_vecs):
        raise ValueError(
            'lyric_df has %d track ids but lyric_vecs has %d vectors'
            % (len(track_ids), len(lyric_vecs))
        )
    # ids and vectors are paired purely by position
    return dict(zip(track_ids, lyric_vecs))

def lsi_on_query(lsiObj, user_query, tfidf):
    """
    Project the user's search string into the concept space.

    The query is wrapped in a one-element list, transformed with tfidf and
    then with lsiObj; the output of lsiObj.transform is returned.
    """
    query_terms = tfidf.transform([user_query])
    return lsiObj.transform(query_terms)

def retreive_30_tracks(lyric_df, userLsi, lyric_vecs, top_n=30):
    """
    Rank the tracks by cosine similarity to the query and return the best ones.

    Parameters
    ----------
    lyric_df : pandas.DataFrame
        Must contain 'track_name', 'artist_name' and 'track_id'. It is NOT
        modified.
    userLsi : array-like
        A single query vector in the same space as lyric_vecs; a flat vector
        or a one-row matrix are both accepted.
    lyric_vecs : array-like
        One vector per row of lyric_df, in the same row order.
    top_n : int, optional
        Number of tracks to return (default 30).

    Returns
    -------
    pandas.DataFrame
        The top_n most similar tracks, most similar first, with columns
        'track_name', 'artist_name', 'track_id' and a 'feedback' column
        initialised to 0.
    """
    # cosine_similarity needs a 2-D query; normalise to exactly one row
    query = np.asarray(userLsi).reshape(1, -1)
    # one similarity value per track
    sim_vals = cosine_similarity(lyric_vecs, query).ravel()
    # assign() works on a copy, so the stored lyric data does not gain a
    # 'similarity' column as a side effect of searching
    ranked = lyric_df.assign(similarity=sim_vals).sort_values(by='similarity', ascending=False)
    user_playlist = ranked.head(top_n)[['track_name', 'artist_name', 'track_id']]
    # initialize a feedback column and set every row to 0.
    return user_playlist.assign(feedback=0)

def simulate_user_input(user_playlist, seed=None):
    """
    Simulate user feedback by assigning a random 0, 1 or 2 to every track.

    0 = neutral
    1 = dislike
    2 = love

    Parameters
    ----------
    user_playlist : pandas.DataFrame
        Playlist to label. Its 'feedback' column is overwritten in place.
    seed : int, optional
        When given, the feedback is drawn from a private generator seeded with
        this value, so the simulation is reproducible. When omitted, NumPy's
        global generator is used, one draw per track, as before.

    Returns
    -------
    pandas.DataFrame
        The same user_playlist object, with 'feedback' filled in.
    """
    n_tracks = len(user_playlist)
    if seed is None:
        feedback = [np.random.randint(0, 3) for _ in range(n_tracks)]
    else:
        # upper bound is exclusive, so values are 0, 1 or 2
        feedback = np.random.RandomState(seed).randint(0, 3, size=n_tracks)
    user_playlist['feedback'] = feedback
    return user_playlist

def rocchio_feedback(alpha, beta, gamma, phi, user_playlist, track_vec_dict, query_vec=None):
    """
    Rocchio feedback filtering.

    1. Group user feedback by love, neutral, dislike
    2. Calculate the mean vector for each group
    3. Combine: alpha * original query + beta * mean(loves)
       - gamma * mean(dislikes) + phi * mean(neutral)
    4. Return the updated query vector to improve search results

    Parameters
    ----------
    alpha, beta, gamma, phi : float
        Weights for the original query, the loved, the disliked and the
        neutral tracks respectively.
    user_playlist : pandas.DataFrame
        Must contain 'track_id' and 'feedback' (0 = neutral, 1 = dislike,
        2 = love).
    track_vec_dict : dict
        track id -> vector for every track id in user_playlist.
    query_vec : array-like, optional
        The original query vector. When omitted, the notebook-level variable
        userLsi is used, which is what this function always read before the
        parameter existed.

    Returns
    -------
    numpy.ndarray
        The updated query vector.
    """
    if query_vec is None:
        # Backward compatibility: fall back to the global the notebook defines.
        query_vec = userLsi
    query_vec = np.asarray(query_vec)

    # mean vector per feedback state: neutral[0], dislike[1], love[2]
    mean_vecs = {}
    for state in (0, 1, 2):
        state_ids = user_playlist.loc[user_playlist['feedback'] == state, 'track_id']
        if len(state_ids) > 0:
            # tracks with this sentiment exist: average their vectors
            mean_vecs[state] = np.mean([track_vec_dict[track_id] for track_id in state_ids], axis=0)
        else:
            # no tracks in this state: it contributes nothing to the new query
            mean_vecs[state] = 0

    # weighted sum of the original query and the three group means
    return alpha * query_vec + beta * mean_vecs[2] - gamma * mean_vecs[1] + phi * mean_vecs[0]

In [31]:
# Step 6: run every function end to end for one query.
user_query =  'Jealousy, turning saints into the sea Swimming through sick lullabies Choking on your alibis'

# --- build the search index ---
# cleaned lyrics dataframe
lyric_df = load_lyrics_data()
# TF-IDF vectorizer fitted on the lyrics
tfidf = create_lyric_tfidf(lyric_df, min_df=10)
# latent semantic indexing object
lsiObj = lsi_lyrics(lyric_df, tfidf, num_concepts=100)
# lyric vectors in the concept space
lyric_vecs = create_lyric_vecs(lyric_df=lyric_df, lsiObj=lsiObj, tfidf=tfidf)
# track id -> concept vector
track_vec_dict = create_lyric_dictionary(lyric_df=lyric_df, lyric_vecs=lyric_vecs)

# --- first search ---
# the query string, projected into the concept space
userLsi = lsi_on_query(lsiObj=lsiObj, user_query=user_query, tfidf=tfidf)
# playlist dataframe for the query
user_playlist = retreive_30_tracks(lyric_df, userLsi, lyric_vecs)

# --- feedback round ---
# stand-in for real user input while we are not connected to the GUI
user_playlist = simulate_user_input(user_playlist)
# Rocchio feedback filter produces an updated query
rocchioSearch = rocchio_feedback(
    alpha=1.0,
    beta=0.75,
    gamma=0.25,
    phi=0.5,
    user_playlist=user_playlist,
    track_vec_dict=track_vec_dict,
)
# new playlist for the updated query
user_playlist = retreive_30_tracks(lyric_df, rocchioSearch, lyric_vecs)





In [37]:
def create_lsi_dict_pickle(track_vec_dict, path='lsi_vec_dict.p'):
    """
    Pickle the track id -> vector dictionary to disk.

    Parameters
    ----------
    track_vec_dict : dict
        Object to pickle.
    path : str, optional
        Destination file; defaults to 'lsi_vec_dict.p' in the working directory.
    """
    # the with-block closes the file even if pickling fails
    with open(path, 'wb') as pickle_file:
        pickle.dump(track_vec_dict, pickle_file)
def create_lsi_obj_pickle(lsiObj, path='lsi_obj.p'):
    """
    Pickle the LSI object to disk.

    Parameters
    ----------
    lsiObj : object
        Object to pickle.
    path : str, optional
        Destination file; defaults to 'lsi_obj.p' in the working directory.
    """
    # the with-block closes the file even if pickling fails
    with open(path, 'wb') as pickle_file:
        pickle.dump(lsiObj, pickle_file)
def create_tfidf_obj_pickle(tfidf, path='tfidf_obj.p'):
    """
    Pickle the TF-IDF object to disk.

    Parameters
    ----------
    tfidf : object
        Object to pickle.
    path : str, optional
        Destination file; defaults to 'tfidf_obj.p' in the working directory.
    """
    # the with-block closes the file even if pickling fails
    with open(path, 'wb') as pickle_file:
        pickle.dump(tfidf, pickle_file)
def load_lsi_pickle():
    """
    Load the pickled LSI object from 'lsi_obj.p' in the working directory
    and return it.
    """
    # Unpickling can run arbitrary code; only load files this project wrote.
    with open('lsi_obj.p', 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# Fit the LSI object on the lyrics with 100 concepts, then write it to disk.
lsiObj = lsi_lyrics(lyric_df, tfidf, num_concepts=100)
create_lsi_obj_pickle(lsiObj)


TruncatedSVD(n_components=100, random_state=15)

In [22]:
# Write the track name / artist name / track id lookup table to disk.
track_artist_id_df = test_df[['track_name', 'artist_name', 'track_id']]
# NOTE(review): the output file name has no extension and the index is written as
# the first column; confirm that whatever reads this file expects both.
track_artist_id_df.to_csv('track_artist_id_df')

In [35]:
# Show only the first three entries; displaying the whole dictionary dumps every
# track vector into the notebook output.
{track_id: track_vec_dict[track_id] for track_id in list(track_vec_dict)[:3]}

{'0cnn7HzqWw1FtY6HZQp0Ij': array([ 9.24163965e-02,  1.19195931e-01,  7.49313339e-01, -1.87358569e-02,
        -3.41152762e-02,  4.10602773e-03,  9.66209253e-03, -7.31784108e-03,
        -2.94194288e-03,  4.79740719e-03, -5.64479796e-03, -7.34857242e-03,
        -1.15921741e-02,  1.41221307e-02,  5.02441818e-03,  1.46918695e-03,
        -2.03340003e-03,  5.81242646e-03,  3.79179173e-03,  6.19027176e-03,
        -4.93081865e-03, -6.27781048e-03, -6.05013710e-03,  6.72502378e-03,
         1.40414587e-02,  1.87005367e-02,  9.30544782e-03,  3.25977448e-03,
         4.91206487e-03, -1.35180143e-03,  3.65594169e-03,  4.15465069e-03,
         1.11504893e-02, -9.67377777e-04, -1.13391256e-03,  1.65278146e-03,
         1.96954329e-03, -1.52030777e-04,  2.55213265e-03, -5.24576756e-04,
        -2.25854056e-03,  1.39037935e-02, -8.76418872e-03,  2.00256819e-03,
         1.03696230e-03, -7.49873577e-03,  7.73404525e-03,  7.28676432e-04,
         2.91758694e-03,  2.67269206e-02, -3.51097029e-03, -2.