In [90]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import os

In [79]:
# Build the absolute path to the practice dataset.
# Everything in the current working directory that precedes the first
# '/napster_2' is treated as the root; if that marker is absent the whole
# working directory is used as the root.
temp_path = os.getcwd()
root_path = temp_path.partition('/napster_2')[0]
repo_path = '/napster_2/lyric_genius_api/practice_data.csv'
practice_data_path = ''.join((root_path, repo_path))




In [80]:
# Load the practice data and keep only the identifying columns for each track.
track_df = pd.read_csv(practice_data_path)
# list the genres present before the genre column is dropped below
print(track_df.genre.unique())
rel_columns = ['track_id', 'track_name_x', 'artist_name_x']
track_df = track_df[rel_columns]
# rename() returns a new frame, so the result has to be assigned back;
# otherwise track_df silently keeps the original column names.
track_df = track_df.rename(columns={'track_id': 'spotify_id',
                                    'track_name_x': 'track_name',
                                    'artist_name_x': 'artist_name'
})
# last expression of the cell: display the renamed frame
track_df


['alt-rock' 'latin' 'country' 'classical' 'hip-hop' 'edm' 'heavy-metal']


Unnamed: 0,spotify_id,track_name,artist_name
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood
1,2K7xn816oNHJZ0aVqdQsha,Softcore,The Neighbourhood
2,5E30LdtzQTGqRvNd7l6kG5,Daddy Issues,The Neighbourhood
3,2iUmqdfGZcHIhS3b9E9EWq,Everybody Talks,Neon Trees
4,7zwn1eykZtZ5LODrf7c0tS,You Get Me So High,The Neighbourhood
...,...,...,...
7045,1XgpK29CGGjZnxPYkiRbh4,Hey DJ,CNCO
7046,5q2MjTDby29ZOEigCVV28a,In My Hood,South Park Mexican
7047,0XpMTExp5q4nLZZZ3msDGn,La Funka,Ozuna
7048,0ulsRBiciReng91DhfVT9D,Bebé,Ozuna


In [81]:
# steps for Rocchio Feedback Filter
# PROCESS 1 convert the raw lyrics into the concept space.
# 1. Create a TFIDF vectorizer
# 2. Create a document term matrix using TFIDF vec fit_transform using the raw lyrics. [I think there is an option to lemmatize here]
# 3. Complete latent semantic indexing using TruncatedSVD(num components = num concepts, specify the random state)
# 4. Fit the document term matrix using TruncatedSVD.fit_transform. THESE ARE YOUR VECTORS FOR SIMILARITY SCORING

# PROCESS 2 convert the query into a vector
# 1. Convert query into a raw string
# 2. Use the TFIDF vectorizer above to transform the query
# 3. Use the LSI object above to convert the query into the concept space.

# PROCESS 3 execute the search
# 1. Find the cosine similarity between the query and all lyrics
# 2. Sort the tracks by similarity
# 3. Return the top N tracks to the user.

# PROCESS 4 Rocchio Feedback Filtering
# 1. Group user feedback by love, no answer, dislike
# 2. Calculate the mean for each group
# 3. Apply alpha, beta, gamma, and phi to:
#       Original search, loves, hates, neutral
# 4. Update the lyric search query vector and return new results!



In [82]:
# Tuning Parameters
# minimum document frequency passed to the TF-IDF vectorizer
min_df = 10
# number of components (concepts) used for the later SVD step
num_concepts = 100
# step 1 import old lyrical data into a dataframe.
lyric_df = pd.read_csv(practice_data_path)
# drop every row whose genre is 'latin'
lyric_df = lyric_df[lyric_df['genre'] != 'latin']
lyric_df = lyric_df[['track_name_x', 'artist_name_x', 'track_id', 'lyric_raw']]
lyric_df = lyric_df.rename(columns={
    'track_name_x': 'track_name',
    'artist_name_x': 'artist_name'
})
# .copy() makes lyric_df an independent frame, so the column assignments
# below (and in later cells) never write into a slice of the raw data.
lyric_df = lyric_df.dropna().copy()
# The cleaned column is assigned back explicitly: calling
# replace(..., inplace=True) on lyric_df['lyric_raw'] is a chained in-place
# update that pandas warns about and that does not modify lyric_df when
# Copy-on-Write is enabled.
# replace new line character
lyric_df['lyric_raw'] = lyric_df['lyric_raw'].replace('\n', ' ', regex=True)
# remove embed text from lyric genius API
lyric_df['lyric_raw'] = lyric_df['lyric_raw'].replace('[0-9]{1,3}Embed', '', regex=True)
# create vectorizer object; tokens are produced by plain whitespace splitting
vecObj = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
# fit the TFIDF vectorizer and build the document-term matrix
# (one row per remaining lyric, in lyric_df row order)
docTermMat = vecObj.fit_transform(lyric_df['lyric_raw'])



In [92]:
# Latent semantic indexing: reduce the TF-IDF document-term matrix to
# num_concepts dimensions. random_state is fixed so the fit is repeatable.
lsiObj = TruncatedSVD(n_components=num_concepts, random_state=42)
# docVecs has one row per document, in the same order as the rows of docTermMat
docVecs = lsiObj.fit_transform(docTermMat)
# Map each track id to its concept-space vector. Rows of docVecs and entries
# of track_ids are paired positionally; if a track id occurs more than once
# the last occurrence wins.
track_ids = lyric_df['track_id'].values
track_vec_dict = dict(zip(track_ids, docVecs))



In [84]:
# Part 2: convert the query into a vector in the concept space.
# The raw query is a free-text lyric fragment.
user_query = 'Jealousy, turning saints into the sea Swimming through sick lullabies Choking on your alibis'
# Vectorize with the already-fitted TF-IDF vectorizer. The query is wrapped in
# a one-element list, so the result has a single row.
userVec = vecObj.transform([user_query])
# Project the query's TF-IDF row into the concept space with the fitted LSI object.
userLsi = lsiObj.transform(userVec)


In [85]:
# part 3 execute search using cosine similarity
# 1. Find the cosine similarity between the query and all lyrics
# 2. Sort the tracks by similarity
# 3. Return the top N tracks to the user.

# number of tracks returned to the user
top_n = 30

# calculate cosine similarity between every track and the lyric provided.
# The result has one row per row of docVecs and one column per row of userLsi.
simVals = cosine_similarity(docVecs, userLsi)
# Store the scores next to the track metadata. ravel() flattens the
# single-column score matrix into a 1-D array before it becomes a column;
# the assignment is positional, so lyric_df must still have one row per
# row of docVecs, in the same order.
lyric_df['similarity'] = simVals.ravel()

# sort_values returns a new, sorted frame; lyric_df itself keeps its original
# row order, which is the order docVecs was built in.
sim_df = lyric_df.sort_values(by='similarity', ascending=False)
# .copy() makes the playlist an independent frame so adding the feedback
# column does not write into a slice of sim_df.
user_playlist = sim_df.head(top_n)[['track_name', 'artist_name', 'track_id']].copy()
# initialize a feedback column and set every row to 0.
user_playlist['feedback'] = 0


In [86]:
# simulate user input here: one random integer in {0, 1, 2} per playlist row.
# A seeded generator is used so that re-running the notebook reproduces the
# same simulated feedback.
feedback_rng = np.random.default_rng(42)
feedback = feedback_rng.integers(0, 3, size=len(user_playlist)).tolist()
user_playlist['feedback'] = feedback

In [104]:
# PROCESS 4 Rocchio Feedback Filtering
# 1. Group user feedback by love, neutral, dislike
# 2. Calculate the mean for each group
# 3. Apply alpha, beta, gamma, and phi to:
#       Original search, loves, hates, neutral
# 4. Update the lyric search query vector and return new results!

# tuning parameters
# original query gets no penalty
alpha = 1.0
# loved songs get a beta positive weight.
beta = 0.75
# disliked songs get a gamma negative weight
gamma = 0.25
# neutral songs get a phi positive weight
phi = 0.5

# mean concept-space vector for each of the 3 feedback states.
# A plain dict is used because every state key is always filled in below.
meanVectDict = {}

# iterate through the three states neutral[0], dislike[1], love[2]
for state in range(3):
    # track ids of the playlist rows that received this feedback value
    state_track_ids = user_playlist.loc[user_playlist['feedback'] == state, 'track_id']
    if len(state_track_ids) > 0:
        # tracks with this sentiment exist:
        # look up their concept-space vectors in track_vec_dict
        stateVecs = [track_vec_dict[track_id] for track_id in state_track_ids]
        # element-wise mean over the tracks of this state; the key is the state
        meanVectDict[state] = np.mean(stateVecs, axis=0)
    else:
        # no tracks in this state: use a zero vector with one entry per
        # column of userLsi, so every value in the dict is an array and
        # the state contributes nothing to the update
        meanVectDict[state] = np.zeros(userLsi.shape[1])

# calculate the new query vector as the weighted combination of the original
# query and the per-state means (the dislike mean is subtracted)
newQueryVec = alpha * userLsi + beta * meanVectDict[2] - gamma * meanVectDict[1] + phi * meanVectDict[0]

        


In [105]:
# STEP 5 convert the notebook into functions and create a loop!