In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape


(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens] 
    return " ".join(stemming)

[nltk_data] Downloading package punkt to C:\Users\Abhijit
[nltk_data]     Deb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [10]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [11]:
similarity[0]

array([1.        , 0.11024016, 0.03482332, ..., 0.05790715, 0.02854179,
       0.00764617])

In [12]:
df['song'][0]

'Practical Arrangement'

In [13]:
df[df['song']=='Practical Arrangement']

Unnamed: 0,artist,song,text
0,Sting,Practical Arrangement,am i ask for the moon ' is it realli so implau...


Recommendation Function

In [14]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [16]:
recommendation("Practical Arrangement")

['Here Comes The Moon',
 'Live And Learn',
 'The Way You Look Tonight',
 'Spell Of Desire',
 'The More We Try',
 'Do I Have To Dance All Night',
 'I Thought It Took A Little Time',
 'When Are You Ever Gonna Learn',
 'Love song',
 'Higher Ground',
 'That Lonesome Road',
 'How High The Moon',
 'All This Time',
 'Once In A Very Blue Moon',
 'If It Were You',
 'Forever',
 'Only You Know',
 'Show Me A Smile',
 'On Any Given Night',
 'The Boy With A Moon And Star On His Head']

In [17]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))