## Import Package

In [1]:
import re
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
import spacy

# import spacy model en_core_web_md
nlp = spacy.load('en_core_web_md')

## Clean Data

In [2]:
df = pd.read_csv('../data/spotify_songs.csv')

In [3]:
def clean_data(songs):
    """
    Input DataFrame
    ---------
    Return the new clean Data
    """
    # Copy data frame so it won't alter the input
    songs = songs.copy()
    
    # Drop NA
    songs = songs.dropna()
    
    # Select song in English
    songs = songs[songs['language']=='en']
    
    # Drop dublicates song that have the same name and artist
    songs = songs.drop_duplicates(subset = ['track_name', 'track_artist'])
    
    # Subset the dataframe to only contain 3 columns. Reset index
    songs = songs[['track_name', 'track_artist', 'lyrics']].reset_index()
    
    return songs.drop(columns='index')
    
# Create a new clean Data Frame
spotify = clean_data(df)

In [4]:
spotify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13726 entries, 0 to 13725
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   track_name    13726 non-null  object
 1   track_artist  13726 non-null  object
 2   lyrics        13726 non-null  object
dtypes: object(3)
memory usage: 321.8+ KB


In [45]:
# Save the clean data into csv
spotify.to_csv('../data/trimmed_spotify.gz', index=False, compression='gzip')

In [6]:
def clean_text(text):
    """
    Takes in text and returns a clean text which mean
    Non-alphabet are filtered out. 
    Replace multi white spaces with single white space
    """
    
    # order of operations - apply the expression from top to bottom
    non_alpha = '[^a-zA-Z]'
    multi_white_spaces = "[ ]{2,}"

    text = re.sub(non_alpha, ' ', text)
    text = re.sub(multi_white_spaces, " ", text)
    
    # apply case normalization 
    return text.lower().strip()

In [8]:
# Add new clean_lyrics col
spotify['clean_lyrics'] = spotify['lyrics'].apply(clean_text)

In [9]:
spotify.head()

Unnamed: 0,track_name,track_artist,lyrics,clean_lyrics
0,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",the trees are singing in the wind the sky blue...
1,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",na yeah spyderman and freeze in full effect uh...
2,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,i really can t stay baby it s cold outside i v...
3,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,get up out of my business you don t keep me fr...
4,Soldier,James TW,"Hold your breath, don't look down, keep trying...",hold your breath don t look down keep trying d...


## Model

In [10]:
def tokenize(document):
    """
    Takes a doc and returns a list of tokens in the form of lemmas.
    Stop words and punctuation are filtered out. 
    """
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [12]:
# Instantiate vectorizer object
tfidf_vect = TfidfVectorizer(tokenizer=tokenize, min_df=0.03, 
                        max_df=0.28)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
dtm = tfidf_vect.fit_transform(spotify['clean_lyrics'])

# View Feature Matrix as DataFrame
dtm = pd.DataFrame(data=dtm.todense(), columns=tfidf_vect.get_feature_names())
dtm

Unnamed: 0,act,afraid,ah,ain,air,alive,alright,apart,arm,ask,...,wouldn,write,wrong,x,y,ya,year,yes,yo,young
0,0.0,0.0,0.0,0.0,0.0,0.335022,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.140011,0.0,0.0,0.000000,0.0,0.0,0.223404,0.000000
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13721,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
13722,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000
13723,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.270345
13724,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.147579,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000


In [37]:
# Fit on DTM
nn = NearestNeighbors(n_neighbors=6, algorithm='auto').fit(dtm)

In [38]:
model_filename = '../ml/nlp_model' 
joblib.dump(nn, open(model_filename, 'wb'))

In [43]:
def favorite_track(name, artist):
    """
    Receive the name and artist of a song then return the 5 most similar song
    --------
    Input
    Return
    """
    # find the song index
    selected_song_index = spotify[(spotify['track_name']==name) & (spotify['track_artist']==artist)].index.tolist()
    selected_song = [dtm.iloc[selected_song_index[0]].values] # selected_song_index return a list of 1
    _, neigh_index = nn.kneighbors(selected_song)
    print(neigh_index)
    song_list = []
    for i in neigh_index[0][1:]:
        song_list.append(f"{spotify['track_name'][i]} by {spotify['track_artist'][i]}")
    return song_list

In [44]:
favorite_track('Dumb Litty', 'KARD')

[[    3  1390  4559 12637  1745 11776]]


['Gyal You A Party Animal - Remix by Charly Black',
 '1999 by Prince',
 'We like to Party! (Six Flags) by Vengaboys',
 'Not Your Birthday by Allstar Weekend',
 'Party All the Time by Eddie Murphy']