In [1]:
import pandas as pd

df= pd.read_csv("spotify_millsongdata.csv")
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [2]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [3]:
df= df.sample(5000).drop("link", axis= 1).reset_index(drop= True)
df.head()

Unnamed: 0,artist,song,text
0,Velvet Underground,I Can't Stand It,It's hard being a man \r\nLiving in a garbage...
1,Owl City,Hey Anna,"Hey Anna, it's just a boy you won't understand..."
2,Roy Orbison,"Here Comes The Rain, Baby","Here comes the rain, baby \r\nGuess it's good..."
3,Dewa 19,Kirana,Kucoba memahami tempatku berlabuh \r\nTerdamp...
4,The Monkees,Come On In,"Come on in, \r\nYou sure look good to me. \r..."


In [4]:
df['text'][0]

"It's hard being a man  \r\nLiving in a garbage pail  \r\nMy landlady called me up, oh  \r\nShe tried to hit me with a mop  \r\n  \r\nI can't stand it any more more  \r\nI can't stand it any more more, oh  \r\nI can't stand it any more more  \r\n  \r\nBut if baby would just come back, it'd be all right  \r\nYeah, if baby would just come back, it would be all right  \r\nCome on, baby  \r\n  \r\nI live with thirteen dead cats  \r\nA purple dog that wears spats  \r\nThey're all living out in the hall  \r\nAnd I can't stand it any more  \r\n  \r\nI can't stand it any more more  \r\nI can't stand it any more more  \r\nI can't stand it any more more  \r\n  \r\nBut if baby would just come back, it'd be all right  \r\nOh, if baby would just come back, it would be all right  \r\nBe all right, now  \r\n  \r\nCome on, baby  \r\n  \r\nI'm tired of living all alone, yeah  \r\nNobody ever calls me on the phone  \r\nBut when things are getting better  \r\nI just play my music louder  \r\n  \r\nI can'

In [5]:
# For Simplicity we are taking a sample
# df= df.sample(5000)

In [6]:
df.head()

Unnamed: 0,artist,song,text
0,Velvet Underground,I Can't Stand It,It's hard being a man \r\nLiving in a garbage...
1,Owl City,Hey Anna,"Hey Anna, it's just a boy you won't understand..."
2,Roy Orbison,"Here Comes The Rain, Baby","Here comes the rain, baby \r\nGuess it's good..."
3,Dewa 19,Kirana,Kucoba memahami tempatku berlabuh \r\nTerdamp...
4,The Monkees,Come On In,"Come on in, \r\nYou sure look good to me. \r..."


In [7]:
df.shape

(5000, 3)

## Text Preprocessing/Cleaning

In [8]:
df['text']= df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n',' ', regex= True)       #a-ZA-Z0-9 = \w\s

In [9]:
# As there are many approaches for this recommendation system i.e. Content-Based Filtering, Collaborative Filtering, Hybrid....etc
# I'm gonna use the Content Based Filtering

import nltk
from nltk.stem.porter import PorterStemmer


In [10]:
stemmer = PorterStemmer()

In [11]:
def token(txt):
    token= nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [12]:
token("you are beautiful, beauty")

'you are beauti , beauti'

In [13]:
df['text'].apply(lambda x: token(x))

0       it 's hard be a man live in a garbag pail my l...
1       hey anna , it 's just a boy you wo n't underst...
2       here come the rain , babi guess it 's goodby a...
3       kucoba memahami tempatku berlabuh terdampar di...
4       come on in , you sure look good to me . come o...
                              ...                        
4995    let there be light , let there be joy let ther...
4996    `` and now the sever beat of a high school spa...
4997    thi might just be a wast of time , but there '...
4998    i wish my heart could talk to you like i do ev...
4999    love you so much , ca n't count all the way i ...
Name: text, Length: 5000, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tfid.fit_transform(df['text'])

In [16]:
matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 269264 stored elements and shape (5000, 24523)>

In [17]:
similar = cosine_similarity(matrix)
similar[0]

array([1.        , 0.00799601, 0.3301152 , ..., 0.01924517, 0.0222342 ,
       0.06109552])

## Recommender Function

In [18]:
df[df['song'] == "Do I Ever Cross Your Mind"].index[0]

np.int64(4987)

In [19]:
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse= True, key= lambda x:x[1])
    song = []
    for s_id in distance[1:21]:      # 1:21 is for how many lines needed to display
        song.append(df.iloc[s_id[0]].song)
    return song

In [20]:
recommender("Do I Ever Cross Your Mind")

['Sweet Love',
 "Takin' Me Back",
 'Touch And Go',
 'Love Is A Sweet Thing',
 'Fantasy',
 'Sweet Young Thing Like You',
 'Never',
 "I Don't Mind",
 "Cryin' Time",
 'How Long',
 'Long, Long Way To Go',
 'Darling Days',
 'Child Of Mine',
 'How Long Will I Love You',
 'I Can See It In Your Eyes',
 'At This Moment',
 'Oh My My',
 'Love',
 'Did You Know',
 "I'd Do It All Over Again"]

In [21]:
import pickle
pickle.dump(similar, open("similarity.pkl", "wb"))
pickle.dump(df, open("df.pkl", "wb"))