In [1]:
import re
import pickle
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import spacy

# Load spaCy's medium English pipeline; `nlp` is the callable used by tokenize() below.
# NOTE(review): requires the `en_core_web_md` model package to be installed in the kernel's environment.
nlp = spacy.load('en_core_web_md')

In [2]:
# Read the raw Spotify songs table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/spotify_songs.csv')

In [3]:
def clean_data(songs):
    """
    Return a trimmed copy of the raw songs table for lyric modelling.

    Rows with a missing value in any column are dropped, only rows whose
    'language' equals 'en' are kept, and repeated (track_name, track_artist)
    pairs are reduced to their first occurrence. The input frame is not
    modified.

    Parameters
    ----------
    songs : pandas.DataFrame
        Must contain 'track_name', 'track_artist', 'lyrics' and 'language'.

    Returns
    -------
    pandas.DataFrame
        Columns 'track_name', 'track_artist', 'lyrics' with a fresh
        0..n-1 index.
    """
    # dropna() returns a new frame, so the caller's data is left untouched.
    # Note the NaN check spans every column, not only the three kept below.
    songs = songs.dropna()
    songs = songs[songs['language'] == 'en']
    songs = songs.drop_duplicates(subset=['track_name', 'track_artist'])
    # drop=True discards the old index outright. The previous reset_index()
    # followed by drop(columns='index') raised KeyError for a named index.
    return songs[['track_name', 'track_artist', 'lyrics']].reset_index(drop=True)
    
# Build the cleaned, English-only, de-duplicated song table used by the rest of the notebook
spotify = clean_data(df)

In [4]:
# Sanity check: column names, dtypes and non-null counts of the cleaned table
spotify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13726 entries, 0 to 13725
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   track_name    13726 non-null  object
 1   track_artist  13726 non-null  object
 2   lyrics        13726 non-null  object
dtypes: object(3)
memory usage: 321.8+ KB


In [9]:
# Persist the trimmed table to the working directory; index=False keeps the row index out of the CSV
spotify.to_csv('trimmed_spotify.csv', index=False)

In [5]:
# YOUR CODE HERE
def clean_text(text):
    """
    Normalise raw text for vectorisation.

    Every character that is not an ASCII letter is turned into a space,
    runs of spaces are squeezed down to a single space, and the result is
    lower-cased with leading/trailing whitespace removed.
    """
    # (pattern, replacement) pairs; order matters, they run top to bottom.
    # For reference, [\u4e00-\u9fa5_] would detect Chinese characters.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        ("[ ]{2,}", " "),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # case normalisation happens last
    return cleaned.lower().strip()

In [6]:
# Preview the first rows of the cleaned table
spotify.head()

Unnamed: 0,track_name,track_artist,lyrics
0,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu..."
1,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U..."
2,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...
3,Dumb Litty,KARD,Get up out of my business You don't keep me fr...
4,Soldier,James TW,"Hold your breath, don't look down, keep trying..."


In [7]:
# Add a `clean_lyrics` column: each lyric passed through clean_text
# (letters only, single-spaced, lower-cased). This mutates `spotify` in place.
spotify['clean_lyrics'] = spotify['lyrics'].apply(clean_text)

In [8]:
def tokenize(document):
    """
    Turn a piece of text into a list of lemma strings.

    The text is run through the notebook-level spaCy pipeline `nlp`;
    stop words and punctuation tokens are skipped.

    Parameters
    ----------
    document : str
        Text to tokenize.

    Returns
    -------
    list of str
        Whitespace-trimmed lemmas; empty strings are never included.
    """
    lemmas = []
    for token in nlp(document):
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.strip()
        # Whitespace-only tokens strip down to '', which would otherwise
        # end up as a meaningless vocabulary entry downstream.
        if lemma:
            lemmas.append(lemma)
    return lemmas

In [9]:
# Instantiate vectorizer object with the spaCy-based tokenizer.
# min_df / max_df are proportions of documents: terms appearing in fewer than
# 3% or more than 30% of songs are ignored.
tfidf_vect = TfidfVectorizer(tokenizer=tokenize, min_df=0.03,
                             max_df=0.3)

# Learn the vocabulary and compute the TF-IDF weights in one step
# (similar in spirit to fit_predict).
# NOTE(review): this runs spaCy over every lyric; the recorded run of this
# cell ended in a KeyboardInterrupt, so expect it to be slow.
dtm = tfidf_vect.fit_transform(spotify['clean_lyrics'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installs.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# View Feature Matrix as DataFrame
dtm = pd.DataFrame(data=dtm.toarray(), columns=feature_names)
dtm

KeyboardInterrupt: 

In [19]:
# Instantiate vectorizer object (default word tokenizer, built-in English stop-word list)
tfidf_vect = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and compute the TF-IDF weights in one step
# (similar in spirit to fit_predict). fit_transform does not modify its
# input, so no defensive copy of the column is needed.
dtm = tfidf_vect.fit_transform(spotify['lyrics'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installs.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# View Feature Matrix as DataFrame. toarray() yields a plain ndarray,
# whereas todense() returned the legacy np.matrix type.
# NOTE(review): densifying an unfiltered vocabulary can need a lot of memory.
# The recorded output (columns act ... young) looks like a min_df/max_df-
# filtered run rather than this configuration - confirm the cell was re-run.
dtm = pd.DataFrame(data=dtm.toarray(), columns=feature_names)
dtm

Unnamed: 0,act,afraid,ah,ain,air,alive,alright,apart,arms,ask,...,work,world,worth,wouldn,wrong,ya,years,yes,yo,young
0,0.0,0.0,0.0,0.0,0.0,0.435875,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.060992,0.000000,0.0,0.143036,0.000000,0.0,0.0,0.226545,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.078751,0.370283,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13721,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
13722,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000
13723,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.048772,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.360268
13724,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.149858,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [12]:
dtm_filename = 'nlp_dtm.pkl'
# `dtm_new` is not assigned anywhere in this notebook (NameError on a fresh
# kernel); `dtm` is the document-term frame that a later cell reloads from
# this same file. The with-block guarantees the handle is flushed and closed.
with open(dtm_filename, 'wb') as dtm_file:
    pickle.dump(dtm, dtm_file)

In [20]:
dtm_filename = 'nlp_dtm_joblib'
# Same payload as the pickle dump, written with joblib. The with-block closes
# the handle, which the bare open(...) argument never did.
with open(dtm_filename, 'wb') as dtm_file:
    joblib.dump(dtm, dtm_file)

In [None]:
dtm_filename = 'nlp_dtm.pkl'

# Reload the cached document-term frame instead of recomputing it.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook (or another trusted source) produced.
with open(dtm_filename, 'rb') as dtm_file:
    dtm = pickle.load(dtm_file)

In [None]:
# Fit an unsupervised nearest-neighbour index on the document-term matrix.
# Each query returns 5 neighbours; algorithm='auto' lets scikit-learn choose the search structure.
nn = NearestNeighbors(n_neighbors=5, algorithm='auto').fit(dtm)

In [None]:
model_filename = 'nlp_model.pkl'
# Persist the fitted NearestNeighbors model; the with-block closes the file
# handle, which the bare open(...) argument never did.
with open(model_filename, 'wb') as model_file:
    pickle.dump(nn, model_file)

In [None]:
def favorite_track(name, artist):
    """
    Print and return the songs whose lyrics sit closest to the given track.

    Relies on three notebook-level objects: `spotify` (song table), `dtm`
    (document-term frame) and `nn` (fitted NearestNeighbors model).
    Assumes `dtm` rows are in the same order as `spotify` rows and that
    `spotify` has a 0..n-1 index - TODO confirm if either is rebuilt.

    Parameters
    ----------
    name : str
        Exact 'track_name' value to look up.
    artist : str
        Exact 'track_artist' value to look up.

    Returns
    -------
    list of str
        One "Song: <name> by <artist>" line per neighbour, in the order
        returned by `nn.kneighbors`.

    Raises
    ------
    IndexError
        If no row matches the name/artist pair.
    """
    is_match = (spotify['track_name'] == name) & (spotify['track_artist'] == artist)
    matches = spotify.index[is_match].tolist()
    if not matches:
        # Same exception type as the old bare `[0]` lookup, but with a usable message.
        raise IndexError("No song named {!r} by {!r} found".format(name, artist))

    # kneighbors expects a 2-D input, hence the single-row list.
    query = [dtm.iloc[matches[0]].values]
    _, neigh_index = nn.kneighbors(query)

    # NOTE(review): the query row is part of the data `nn` was fitted on, so
    # the track itself will normally appear among its own neighbours.
    song_list = []
    # neigh_index holds one row per query; walk the neighbours of our single
    # query rather than the outer array (which yielded the whole row at once).
    for i in neigh_index[0]:
        text = "Song: {} by {}".format(spotify['track_name'].iloc[i],
                                       spotify['track_artist'].iloc[i])
        print(text)
        song_list.append(text)
    return song_list

In [None]:
# Spot-check: look up the title stored under row label 6512
spotify['track_name'][6512]

In [None]:
# `a` was never assigned in this notebook (NameError on a fresh kernel).
# Resolve it here as the row label(s) of the example track that is also
# passed to favorite_track further down.
a = spotify[(spotify['track_name'] == 'I Feel Alive')
            & (spotify['track_artist'] == 'Steady Rollin')].index.tolist()

# kneighbors expects a 2-D input, hence the single-row list
doc = [dtm.iloc[a[0]].values]

# Query Using kneighbors
neigh_dist, neigh_index = nn.kneighbors(doc)

In [None]:
# Inspect the neighbour indices returned by the kneighbors query above
neigh_index

In [None]:
# Example: recommend songs similar to one track from the table
favorite_track('I Feel Alive', 'Steady Rollin')

In [None]:
# Inspect `a`, the index list used to build the kneighbors query above
a

In [None]:
# Preview the display string for the row labelled 0
f"Song: {spotify['track_name'][0]} by {spotify['track_artist'][0]}"