In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("spotify_millsongdata.csv")

In [3]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [5]:
df.shape

(57650, 4)

In [6]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [7]:
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)

In [8]:
df['text'][0]

"I just don't know what to do tonight,  \r\nMy head is aching as I drink and breathe  \r\nMemory falls like cream in my bones, moving on my own.  \r\n  \r\nThere must be something I can dream tonight,  \r\nThe air is filled with the moves of you,  \r\nAll the fire is frozen yet still I have the will, ooh, ah.  \r\n  \r\nTrumpets, violins, I hear them in the distance  \r\nAnd my skin emits a ray, but I think it's sad, it's much too bad  \r\nThat our friends can't be with us today\r\n\r\n"

In [9]:
df.shape

(5000, 3)

In [10]:
# text preprocessing
df['text'] = df['text'].str.lower().replace(r'^\W\S',' ').replace(r'\n',' ',regex = True)


In [11]:
df.tail(5)

Unnamed: 0,artist,song,text
4995,Imago,Idlip,neither awake or asleep \r dwell somewhere in...
4996,Paul McCartney,Home,"evening brings the close of day, \r skies of ..."
4997,Everclear,Invisible,i will i will live for a year or two \r maybe...
4998,Nick Cave,Let The Bells Ring,"c'mon, kind sir, let's walk outside \r and br..."
4999,Hank Snow,Chattanooga Choo Choo,pardon me boy is this the chattanooga choo cho...


In [12]:
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
stemmer = PorterStemmer()


In [14]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a =[stemmer.stem(W) for W in token]
    return "  ".join(a)


In [15]:
token("you are beautiful, beauty")

'you  are  beauti  ,  beauti'

In [16]:
df['text'].apply(lambda x: token(x))

0       i  just  do  n't  know  what  to  do  tonight ...
1       late  i  've  been  think  how  much  i  miss ...
2       call  out  to  prove  my  love  you  repli  by...
3       (  merl  travi  )  come  and  listen  you  fel...
4       well  there  's  news  fli  'round  on  the  r...
                              ...                        
4995    neither  awak  or  asleep  dwell  somewher  in...
4996    even  bring  the  close  of  day  ,  sky  of  ...
4997    i  will  i  will  live  for  a  year  or  two ...
4998    c'mon  ,  kind  sir  ,  let  's  walk  outsid ...
4999    pardon  me  boy  is  thi  the  chattanooga  ch...
Name: text, Length: 5000, dtype: object

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
tfid = TfidfVectorizer(analyzer='word' , stop_words='english')

In [19]:
matrix = tfid.fit_transform(df['text'])

In [20]:
similer = cosine_similarity(matrix)

In [21]:
similer[0]

array([1.        , 0.02191426, 0.00982325, ..., 0.03127791, 0.02892103,
       0.00966963])

In [22]:
idx = df[df['song']=='Give You Up'].index[0] if not df[df['song']=='Give You Up'].empty else None

In [23]:
def recommender(song_name):
    try:
        idx = df[df['song'] == song_name].index[0]
    except IndexError:
        return []
    return [df.iloc[s_id[0]].song for s_id in sorted(list(enumerate(similer[idx])), reverse=True, key=lambda x: x[1])[1:5]]

In [24]:
recommender("Give You Up")


[]

In [25]:
import pickle

In [26]:
pickle.dump(similer, open('similarity', 'wb'))

In [27]:
pickle.dump(df, open('df', 'wb'))