In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import os

In [12]:
# Build the path to the practice data set from the current working directory:
# keep everything in front of the first '/napster_2' segment (or the whole
# working directory when that segment is absent) and append the location of
# the CSV inside the repository.
temp_path = os.getcwd()
root_path = temp_path.partition('/napster_2')[0]
repo_path = '/napster_2/lyric_genius_api/practice_data.csv'
practice_data_path = f'{root_path}{repo_path}'




In [45]:
# Load the practice data set and list the genres it contains.
track_df = pd.read_csv(practice_data_path)
print(track_df.genre.unique())

# Keep only the identifying columns and give them friendlier names.
# DataFrame.rename returns a new frame, so the result has to be assigned back;
# otherwise track_df silently keeps the original column names.
rel_columns = ['track_id', 'track_name_x', 'artist_name_x']
track_df = track_df[rel_columns].rename(columns={
    'track_id': 'spotify_id',
    'track_name_x': 'track_name',
    'artist_name_x': 'artist_name',
})

# Last expression of the cell: display the renamed table.
track_df


['alt-rock' 'latin' 'country' 'classical' 'hip-hop' 'edm' 'heavy-metal']


Unnamed: 0,spotify_id,track_name,artist_name
0,2QjOHCTQ1Jl3zawyYOpxh6,Sweater Weather,The Neighbourhood
1,2K7xn816oNHJZ0aVqdQsha,Softcore,The Neighbourhood
2,5E30LdtzQTGqRvNd7l6kG5,Daddy Issues,The Neighbourhood
3,2iUmqdfGZcHIhS3b9E9EWq,Everybody Talks,Neon Trees
4,7zwn1eykZtZ5LODrf7c0tS,You Get Me So High,The Neighbourhood
...,...,...,...
7045,1XgpK29CGGjZnxPYkiRbh4,Hey DJ,CNCO
7046,5q2MjTDby29ZOEigCVV28a,In My Hood,South Park Mexican
7047,0XpMTExp5q4nLZZZ3msDGn,La Funka,Ozuna
7048,0ulsRBiciReng91DhfVT9D,Bebé,Ozuna


In [4]:
# steps for Rocchio Feedback Filter
# PROCESS 1 convert the raw lyrics into the concept space.
# 1. Create a TFIDF vectorizer
# 2. Create a document term matrix using TFIDF vec fit_transform using the raw lyrics. [I think there is an option to lemmatize here]
# 3. Complete latent semantic indexing using TruncatedSVD(num components = num concepts, specify the random state)
# 4. Fit the document term matrix using TruncatedSVD.fit_transform. THESE ARE YOUR VECTORS FOR SIMILARITY SCORING

# PROCESS 2 convert the query into a vector
# 1. Convert the query into a raw string
# 2. Use the TFIDF vectorizer above to transform the query
# 3. Use the LSI object above to convert the query into the concept space.

# PROCESS 3 execute the search
# 1. Find the cosine similarity between the query and all lyrics
# 2. Sort the tracks by similarity
# 3. Return the top N tracks to the user.

# PROCESS 4 Rocchio Feedback Filtering
# 1. Group user feedback by love, no answer, dislike
# 2. Calculate the mean for each group
# 3. Apply alpha, beta, gamma, and phi to:
#       Original search, loves, hates, neutral
# 4. Update the lyric search query vector and return new results!



In [47]:
# Tuning parameters
# min_df: minimum document frequency handed to the TF-IDF vectorizer below.
min_df = 10
# num_concepts: number of components requested from TruncatedSVD in the LSI cell.
num_concepts = 100

# Step 1: load the lyric data, discard the 'latin' genre, keep only the columns
# needed for the search, rename them, and drop rows with any missing value.
lyric_df = pd.read_csv(practice_data_path)
lyric_df = lyric_df[lyric_df['genre'] != 'latin']
lyric_df = (
    lyric_df[['track_name_x', 'artist_name_x', 'track_id', 'lyric_raw']]
    .rename(columns={
        'track_name_x': 'track_name',
        'artist_name_x': 'artist_name',
    })
    .dropna()
)

# Step 2: clean the raw lyric text.
# The cleaned column is assigned back explicitly. Calling
# `lyric_df['lyric_raw'].replace(..., inplace=True)` mutates a temporary
# selection: recent pandas warns about it and, with Copy-on-Write enabled,
# the frame is left unchanged.
lyric_df = lyric_df.assign(
    lyric_raw=(
        lyric_df['lyric_raw']
        # replace new line characters with a space
        .replace('\n', ' ', regex=True)
        # remove the trailing "<digits>Embed" text left by the Genius lyric API
        .replace('[0-9]{1,3}Embed', '', regex=True)
    )
)

# Step 3: create the TF-IDF vectorizer. str.split tokenizes on whitespace only,
# so punctuation stays attached to the neighbouring word.
vecObj = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
# Fit the vectorizer and build the document-term matrix in one pass.
docTermMat = vecObj.fit_transform(lyric_df['lyric_raw'])



In [48]:
# Latent semantic indexing: reduce the TF-IDF document-term matrix to
# `num_concepts` components. The random_state is pinned to a fixed seed.
lsiObj = TruncatedSVD(
    random_state=42,
    n_components=num_concepts,
)
# Fit on the corpus and keep the reduced matrix (one row per row of docTermMat).
docVecs = lsiObj.fit_transform(docTermMat)

In [52]:
# Part 2: map a free-text query into the same concept space as the lyrics.
user_query = (
    'Jealousy, turning saints into the sea '
    'Swimming through sick lullabies '
    'Choking on your alibis'
)
# TF-IDF weights for the query, using the vectorizer already fitted on the lyrics
# (transform only -- the vectorizer is not refitted here).
userVec = vecObj.transform([user_query])
# Project the query's TF-IDF vector through the fitted LSI model.
userLsi = lsiObj.transform(userVec)


array([[ 7.11647273e-02, -6.09642547e-03,  6.04691377e-02,
        -1.44803986e-03, -3.18537253e-02,  3.90851881e-03,
         1.48029911e-03, -1.69040866e-02, -4.14844889e-02,
        -3.59045550e-03, -1.83534099e-02,  3.13040211e-02,
         4.44765655e-02, -1.43040740e-03, -2.23788814e-02,
         4.93484762e-02,  6.49732182e-03, -2.16652048e-02,
        -1.75695539e-02,  3.98033689e-02,  5.34023243e-02,
         3.41196114e-02, -5.59653883e-03,  2.79872286e-03,
        -2.71257728e-02,  2.35466887e-02,  1.96964389e-02,
        -1.10194644e-03,  1.08624196e-02, -5.95757807e-03,
         7.19548376e-04, -8.06209195e-04,  2.35384462e-02,
         3.26576890e-02, -8.88940659e-05,  5.37283415e-03,
        -2.23354021e-04,  5.07858378e-03,  5.04188369e-03,
        -2.38474616e-02,  1.18862912e-02, -2.08163744e-02,
         1.40020875e-02,  2.29551317e-02,  4.86153201e-03,
        -2.07103041e-02, -6.93358791e-03, -2.34872681e-02,
         7.75909846e-03,  2.45840012e-02, -7.40716399e-0