In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spotify.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [4]:
df =df.sample(30000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,artist,song,text
0,Madonna,Just A Dream,[Chorus:] \r\n \r\nWas it just a dream? \r\...
1,Roxette,The Weight Of The World,Sunday morning \r\nI'm still hiding in bed \...
2,Barbra Streisand,Between Yesterday And Tomorrow,"Between yesterday and tomorrow, \r\nThere is ..."
3,System Of A Down,Fortress,Edmund Kemper solved it all \r\nHe foolded th...
4,The Killers,Sweet Talk,Lift me up on my honour \r\nTake me over this...


In [6]:
df.shape

(30000, 3)

### Text cleaning/ Test preprocessing

In [7]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex=True)

In [8]:
import nltk
from nltk.stem.porter import PorterStemmer

In [9]:
stemming = PorterStemmer()

In [10]:
def token(txt):
    token = nltk.word_tokenize(txt)
    a = [stemming .stem(w) for w in token]
    return " ".join(a)


In [11]:
# Apply the token function to the 'text' column
df['text'] = df['text'].apply(token)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [14]:
tfid = tfid.fit_transform(df['text'])

In [15]:
similar = cosine_similarity(tfid)

In [16]:
similar[0]

array([1.        , 0.04241543, 0.06227999, ..., 0.06325866, 0.05624062,
       0.08815551], shape=(30000,))

In [17]:
df[df['song'] == "Of Course I'm Lying"]

Unnamed: 0,artist,song,text


### Recommender function

In [18]:
def recommendor(song_name):
    song = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similar[song])), reverse=True, key=lambda x: x[1])
    song_list = []
    for s_id in distance[1:6]:
        song_list.append(df.iloc[s_id[0]].song)
    return song_list

In [20]:
recommendor("The Weight Of The World")

['Weight Of The World',
 'Sometimes',
 'Sometimes',
 'Hard To Believe',
 'Robot Boy']

In [21]:
import pickle

In [24]:
pickle.dump(similar, open('similarity.pkl', 'wb'))

In [25]:
pickle.dump(df, open('df.pkl', 'wb'))