## Import Libraries

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [77]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [78]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [79]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

## Dataset Overview

In [80]:
df = pd.read_csv('/content/spotify_millsongdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [81]:
df.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [82]:
df.shape

(57650, 4)

In [83]:
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


## Drop 'link' column

In [84]:
df = df.sample(10000).drop('link', axis=1).reset_index(drop=True)

### Check data again

In [85]:
df.head(10)

Unnamed: 0,artist,song,text
0,Joni Mitchell,Sometimes I'm Happy,Sometimes I'm happy \r\nSometimes I'm blue \...
1,Cake,Pentagram,Your pentagram is down below our floor. \r\nY...
2,Noa,We Can Work It Out,"Try to see it my way, \r\nDo I have to keep o..."
3,Bosson,Summer With You,Do you remember our place where we used to mee...
4,Train,Umbrella,You have my heart \r\nAnd we'll never be worl...
5,Cat Stevens,Come On And Dance,I take you out all over town \r\nBut you alwa...
6,Loretta Lynn,I've Never Been This Far Before,I can almost hear the stillness \r\nAs it yel...
7,Freddie King,You Were Wrong,"You were wrong, you were wrong \r\nYou were w..."
8,Ne-Yo,The Plan,Spoken \r\n \r\nHow's everybody doin? Hope y...
9,Conway Twitty,Dream Maker,You are my dream maker \r\nTaking me places I...


### 'text' overview

In [86]:
df["text"][0]

"Sometimes I'm happy  \r\nSometimes I'm blue  \r\nMy disposition depends on you  \r\nI never mind the rain from the skies  \r\nAs long as I have the sun in your eyes  \r\n  \r\nSometimes I love you  \r\nSometimes I hate you  \r\nWhen I hate you  \r\nIt's because I love you  \r\nThat's how I am  \r\nSo what can I do  \r\nI'm happy when I'm with you  \r\n  \r\nSometimes I'm happy  \r\nSometimes I'm blue  \r\nMy disposition depends on you  \r\nI never mind the rain from the skies  \r\nAs long as I have the sun in your eyes  \r\n  \r\nSometimes I love you  \r\nSometimes I hate you  \r\nWhen I hate you  \r\nIt's because I love you  \r\nThat's how I am  \r\nSo what can I do  \r\nI'm happy when I'm with you  \r\n  \r\nI'm happy when I'm with you  \r\nI'm happy when I'm with you  \r\nI'm happy when I'm with you  \r\nI'm happy when I'm with you\r\n\r\n"

## Take 5000 sample

In [87]:
df.head()

Unnamed: 0,artist,song,text
0,Joni Mitchell,Sometimes I'm Happy,Sometimes I'm happy \r\nSometimes I'm blue \...
1,Cake,Pentagram,Your pentagram is down below our floor. \r\nY...
2,Noa,We Can Work It Out,"Try to see it my way, \r\nDo I have to keep o..."
3,Bosson,Summer With You,Do you remember our place where we used to mee...
4,Train,Umbrella,You have my heart \r\nAnd we'll never be worl...


In [88]:
df.shape

(10000, 3)

## Text Cleaning / Text Preprocessing

In [90]:
df["text"] = df["text"] = df["text"].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex=True)

In [91]:
df["text"]

Unnamed: 0,text
0,sometimes i'm happy \r sometimes i'm blue \r...
1,your pentagram is down below our floor. \r yo...
2,"try to see it my way, \r do i have to keep on..."
3,do you remember our place where we used to mee...
4,you have my heart \r and we'll never be world...
...,...
9995,"well, i got a bumper sticker \r on the back o..."
9996,"from wi push di club door \r bombs away, part..."
9997,"i said, ""take it easy, baby \r i worked all d..."
9998,"there are places i remember \r all my life, t..."


In [92]:
df.tail(5)

Unnamed: 0,artist,song,text
9995,George Strait,Honk If You Honky Tonk,"well, i got a bumper sticker \r on the back o..."
9996,Vybz Kartel,Di Way We Roll,"from wi push di club door \r bombs away, part..."
9997,Elvis Presley,"Bossa Nova, Baby","i said, ""take it easy, baby \r i worked all d..."
9998,Rod Stewart,In My Life,"there are places i remember \r all my life, t..."
9999,Arlo Guthrie,Me And My Goose,me and my goose \r me and my pal \r we had s...


In [93]:
def token(txt):
    nltk.download('punkt')
    token = nltk.word_tokenize(txt)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [94]:
token("you are beautiful")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'you are beauti'

In [95]:
df["text"].apply(lambda x: token(x))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /root/

Unnamed: 0,text
0,sometim i 'm happi sometim i 'm blue my dispos...
1,your pentagram is down below our floor . your ...
2,"tri to see it my way , do i have to keep on ta..."
3,do you rememb our place where we use to meet d...
4,you have my heart and we 'll never be world ap...
...,...
9995,"well , i got a bumper sticker on the back of m..."
9996,"from wi push di club door bomb away , parti li..."
9997,"i said , `` take it easi , babi i work all day..."
9998,"there are place i rememb all my life , though ..."


In [96]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [97]:
matrix = tfid.fit_transform(df["text"])
matrix

<10000x34124 sparse matrix of type '<class 'numpy.float64'>'
	with 540133 stored elements in Compressed Sparse Row format>

In [98]:
similar = cosine_similarity(matrix)

In [99]:
similar[0]

array([1.        , 0.00474789, 0.0165744 , ..., 0.        , 0.00947887,
       0.        ])

In [100]:
similar.shape

(10000, 10000)

In [102]:
df[df["song"] == "Me And My Goose" ].index[0]

9999

## Recommender Function

In [103]:
def recommender(song_name):
  idx = df[df["song"] == song_name].index[0]
  distance = sorted(list(enumerate(similar[idx])), reverse=True, key=lambda x:x[1])
  song = []
  for s_id in distance[1:5]:
    song.append(df.iloc[s_id[0]].song)
  return song

In [104]:
recommender('Me And My Goose')

['Cry Of The Wild Goose', 'What Do You Do?', 'Night And Day', 'Flying South']

In [106]:
import pickle
pickle.dump(similar,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))