In [43]:
import pandas as pd

In [45]:
ds = pd.read_csv("spotify_millsongdata.csv")
ds

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...


In [46]:
#Check if data_set has empty values
ds.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [49]:
#Remove link as it is not nessary for our Spotify Web API
# For the sake of running the code, sampling 10% of the data is manageable for laptops
ds = ds.sample(5000).drop("link", axis=1).reset_index(drop=True)
ds

Unnamed: 0,artist,song,text
0,Doors,The Wasp,I wanna tell you 'bout texas radio and the big...
1,Erasure,Here I Go Impossible Again,"How many dreams am I looking for in your eyes,..."
2,Kiss,Rock Hard,"Hot lover, turn up the heat \r\nI want your s..."
3,Radiohead,Wish You Were Here,"So, so you think you can tell \r\nHeaven from..."
4,Everclear,Tv Show,I walk around in the market \r\nLate at night...
...,...,...,...
4995,Vanessa Williams,I'll Be Home For Christmas,I'll be home for christmas \r\n \r\nI'll be ...
4996,Erasure,Take Me Back,The way to your heart \r\nThe way to your hea...
4997,Blur,"Sunday, Sunday","Sunday, Sunday, here again in tidy attire \r\..."
4998,The Beatles,Bad To Me,"If you ever leave me, I'll be sadand blue \r\..."


In [51]:
#Text Cleaning  
ds["text"] = ds["text"].str.lower().replace(r'^\w\s', ' ', regex = True).replace(r'\n', ' ', regex = True)
ds

Unnamed: 0,artist,song,text
0,Doors,The Wasp,wanna tell you 'bout texas radio and the big ...
1,Erasure,Here I Go Impossible Again,"how many dreams am i looking for in your eyes,..."
2,Kiss,Rock Hard,"hot lover, turn up the heat \r i want your su..."
3,Radiohead,Wish You Were Here,"so, so you think you can tell \r heaven from ..."
4,Everclear,Tv Show,walk around in the market \r late at night ...
...,...,...,...
4995,Vanessa Williams,I'll Be Home For Christmas,i'll be home for christmas \r \r i'll be ho...
4996,Erasure,Take Me Back,the way to your heart \r the way to your hear...
4997,Blur,"Sunday, Sunday","sunday, sunday, here again in tidy attire \r ..."
4998,The Beatles,Bad To Me,"if you ever leave me, i'll be sadand blue \r ..."


In [54]:
import nltk
from nltk.stem.porter import PorterStemmer

In [56]:
stemmer = PorterStemmer()

In [58]:
def token(txt):
    # Seperates words into an array; For example "I was running" -> tokens = ["I","was","running"]
    tokens = nltk.word_tokenize(txt)
    # Obtains the root word; For example if w was "running", "runs" "ran", they would all change to run,
    a = [stemmer.stem(w) for w in tokens]
    # Makes lists into sentence
    return " ".join(a)

In [60]:
#example of token operation, changes each sentence to contain only root words, useful for NLP
token("you are beautiful")

'you are beauti'

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [64]:
# Now apply function onto the text of all the songs
ds["text"].apply(lambda x: token(x))

0       wan na tell you 'bout texa radio and the big b...
1       how mani dream am i look for in your eye , in ...
2       hot lover , turn up the heat i want your sugar...
3       so , so you think you can tell heaven from hel...
4       walk around in the market late at night i walk...
                              ...                        
4995    i 'll be home for christma i 'll be home for c...
4996    the way to your heart the way to your heart th...
4997    sunday , sunday , here again in tidi attir you...
4998    if you ever leav me , i 'll be sadand blue do ...
4999    excus me pleas your majesti i chanc thi moment...
Name: text, Length: 5000, dtype: object

In [65]:
# TF = Term Frequency; IDF = Inverse Document Frequency; 
# Essentially it is weighing each word based on frequency; Stop_words are words for grammatical structure, which are not 
# Important for our case, such as "and" or "the";
tfidf = TfidfVectorizer(analyzer="word", stop_words="english")

In [66]:
# Creates a matrix with words and frequency across different lyrics
Matrix = tfidf.fit_transform(ds["text"])

In [67]:
# Computes the angle between all pairs of vectors, vector positions are derived from how similar two lyrics using TF-IDF scores
similarity = cosine_similarity(Matrix)

In [71]:
# Example of how we can get the indexes and therefore data points, this will be implemented in our recommender function
ds[ds['song'] == "Bad To Me"].index[0]

4998

In [85]:
# Recommender function takes in a song 
def recommender(song_name):
    # Grabs an index of a song as mentioned before
    i = ds[ds["song"]==song_name].index[0]
    # Gets the similarity between two vectors based on distance between two song's lyrics 
        # Example: i = "Bad Romance"
        # This will list out a sorted list of songs similar to "Bad Romance" (inclusive) in descending order"
        # [(0, 0.95), (1, 0.85), (2, 0.75), ...] -> [(3, 1.0), (0, 0.95), (1, 0.85), (4, 0.84), ...] 
    similar = sorted(list(enumerate(similarity[i])), reverse=True, key = lambda x:x[1])
    song = []
    # We don't need index 0 because that is the original song itself with a score of 1
    for song_id in similar[1:5]:
        song.append(ds.iloc[song_id[0]].song)
    return song

In [87]:
#Test:
recommender("Bad To Me")

['Love Her All I Can', 'Bad Boy', 'Born Bad', 'If You Knew What I Know']

In [89]:
import pickle

In [97]:
pickle.dump(similarity, open("similarity.pkl", "wb"))

In [98]:
pickle.dump(ds, open("ds.pkl","wb"))