In [11]:
import pandas as pd

In [12]:
data = pd.read_csv('spotify_millsongdata.csv')

In [13]:
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [14]:
data.shape

(57650, 4)

In [15]:
data.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [16]:
data = data.sample(5000).drop('link',axis=1).reset_index(drop=True)

In [17]:
data.tail()

Unnamed: 0,artist,song,text
4995,Hanson,Speechless,Everybody wonders and some people know \r\nI ...
4996,Dewa 19,Sayap-Sayap Patah,Angin tolonglah aku sedang jatuh cinta \r\nTa...
4997,Bing Crosby,On A Slow Boat To China,I'd like to get you on a slow boat to China \...
4998,Billie Holiday,God Bless The Child,Them that's got shall have \r\nThem that's no...
4999,Hillsong United,Go,"We're giving it all away, away \r\nGiving it ..."


In [18]:
data['text'][0]

"If you've ever had  \r\nA broken heart  \r\nYou promise yourself  \r\nTo never let it happen again  \r\nDdon't wanna think about it  \r\nDon't wanna dream about it  \r\nBut it finds  \r\nIt's way back into your head  \r\n  \r\nThe i love you' 's  \r\nThe i need you' 's  \r\nThey're only words  \r\nThat people say  \r\nThey're just words  \r\nWhen they're hand in hand  \r\nWith the games that people play  \r\n  \r\nIf i never see  \r\nThe top of a mountain  \r\nIf i never set  \r\nOne foot in the sea  \r\nI promise you  \r\nI'll love you forever  \r\nYou'll always be  \r\nThe biggest part of me  \r\n  \r\n[Chorus:]  \r\nWherever you go  \r\nWhatever you do  \r\nI will follow  \r\nYou've changed the way  \r\nThat i need to think  \r\nIn promise you  \r\nI'll love you forever  \r\nYou'll always be  \r\nThe biggest part of me  \r\n  \r\nTake a look at me  \r\nI let you have my heart  \r\nI swore i'd never let it  \r\nHappen again  \r\nI'm not mad about it  \r\nBut i know on you girl  \r\n

In [19]:
data.shape

(5000, 3)

# data preprocessing

In [20]:
data['text'] = data['text'].str.lower().replace(r'^\w\s','').replace(r'\n','',regex=True)

In [21]:
import nltk
from nltk.stem.porter import PorterStemmer

# Tokenization

In [22]:
stemmer = PorterStemmer()

In [23]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [24]:
data['text'].apply(lambda x: token(x))

0       if you 've ever had a broken heart you promis ...
1       plead with my saint wash hi hand and feet find...
2       yeah hi guitar swung across hi back hi dusti b...
3       there 's a littl ditti they 're singin ' in th...
4       [ choru : ] just got me a crib and a new car t...
                              ...                        
4995    everybodi wonder and some peopl know i guess t...
4996    angin tolonglah aku sedang jatuh cinta tapi ak...
4997    i 'd like to get you on a slow boat to china a...
4998    them that 's got shall have them that 's not s...
4999    we 're give it all away , away give it all to ...
Name: text, Length: 5000, dtype: object

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
tfid = TfidfVectorizer(analyzer='word',stop_words='english')

In [27]:
matrix = tfid.fit_transform(data['text'])

In [28]:
similar = cosine_similarity(matrix)

In [29]:
similar[1]

array([0.051735  , 1.        , 0.01210809, ..., 0.01360384, 0.01923144,
       0.00926025])

In [30]:
data[data['song']== 'I Can Only Give You Everything'].index[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

# Create Recommender Function

In [31]:
def recommender(song_nam):
    idx = data[data['song']== song_nam].index[0]
    distance = sorted(list(enumerate(similar[idx])),reverse=True ,key = lambda x:x[1])
    song = []
    for s_id in distance[1:21]:
        song.append(data.iloc[s_id[0]].song)
    return song

In [32]:
recommender('Shine')

['Shine',
 'We Are',
 'Let Creation Sing',
 'Light From Your Lighthouse',
 'Hope Of The Broken World',
 'Glory',
 'Do What I Say',
 'King Of All',
 "The Sun Ain't Gonna Shine Anymore",
 'Let Your Soul Shine',
 'Church On Fire',
 "Jesus Don't Give Up On Me",
 'Rise Above',
 'Fairest Lord Jesus',
 'The Last Jesus',
 'God Of Wonders',
 'Shine, Shine, Shine',
 "You're Everything",
 'Go',
 'Never Be The Sun']

In [33]:
import pickle

In [34]:
pickle.dump(similar, open('similarity', 'wb'))

In [35]:
pickle.dump(similar,open('similarity','wb'))