In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [5]:
df.shape

(5000, 3)

In [6]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [7]:
df['text'][0]

'as i go travelling down lifes highway   whatever course my fortunes may foretell   i shall not go alone on my way   for thou shall always be with me rydell      when i seek rest from worldly matters   in palace or in hovel i may dwell   and though my bed be silk or tatters   my dreams shall always be of thee rydell      through all the years, rydell   and tears, rydell   we give three cheers rydell for thee      through everything rydell   we cling, rydell   and sing, rydell to thee  '

In [11]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [12]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [16]:
similarity[0]

array([1.        , 0.00527499, 0.        , ..., 0.00607819, 0.        ,
       0.        ])

In [19]:
df[df['song']==''].index[0]

0

# recommedation function

In [39]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [40]:
recommendation('Alma Mater')

['O Love That Will Not Let Me Go',
 'Shall We Dance',
 'Churchill Speech',
 'Until Next Time',
 'The Lord Is My Sheperd',
 'God Bless The Child',
 'Mary Of Argyle',
 'O Little Town Of Bethlehem',
 'A Call To Harvest',
 'Christmas Secrets',
 'The Time Is Near',
 "All Tomorrow's Parties",
 'Fare Thee Well',
 'Dark Star',
 'As With Gladness Men Of Old',
 'As Much As Always',
 'From The Balcony',
 'May You Never Be Alone',
 'Alleluia! Sing To Jesus!',
 'America The Beautiful']

In [41]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))