In [1]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv('songdata.csv')
df.sample(3)

Unnamed: 0,artist,song,link,text
15313,Overkill,Fast Junkie,/o/overkill/fast+junkie_20201909.html,"Lean in, \nTake it right. \nDown one, \nNo ..."
39823,Kelly Family,You Belong To Me,/k/kelly+family/you+belong+to+me_20351333.html,\n \nIf you please \nIf you want \nWon't ...
44511,Moody Blues,Top Rank Suite,/m/moody+blues/top+rank+suite_20095817.html,Rain on the river turns the torrent to a flood...


In [3]:
df.shape

(57650, 4)

In [4]:
df = df.sample(n=10000).drop('link', axis=1).reset_index(drop=True)

## Using Regular expression to  use words and white spaces and remove everthing else.

In [5]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [6]:
df['text'][0]

"she don't worry about tomorrow   nothing further from her mind   she don't need to beg or borrow   time is on her side   she like coloring the rainbow   never paint between the lines   she like to chase away the shadow   watch them run and hide      kiss me in the morning   before i open up my eyes   would you kiss me in the morning   sunrise      see her dancing on the water   washing dreams upon the shore   heaven wanting you to call her   knocking at your door   she my center of attention   nothing further from my mind   point me in the right direction   sunshine shine your light      would you kiss me in the morning   before i open up my eyes   would you kiss me in the morning   sunrise alright      sunrise kiss and i will follow   sunrise shine your light   sunrise filling up my hollow   sunrise sunrise   sunrise kiss and i will follow   sunrise shine your light   sunrise filling up my hollow   sunrise sunrise sunrise   she helped me find my way  "

In [7]:
#### using NLP nltk here in we are tokenizing our each word of text and creating a list of it and
#### then joining all words at last.

In [8]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


## What's punkt
###  the Punkt tokenizer can tokenize a given text into sentences by identifying the punctuation marks that denote the end of a sentence, such as periods, exclamation marks, and question marks.

In [9]:
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt to C:\Users\Arpit
[nltk_data]     Akar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [13]:
similarity[0]

array([1.        , 0.02952985, 0.00704508, ..., 0.02087391, 0.01180707,
       0.04368307])

In [14]:
sorted(list(enumerate(similarity[1])),reverse=True,key=lambda x:x[1])

[(1, 1.0000000000000002),
 (2592, 0.5158825402168707),
 (20, 0.45910535769316796),
 (734, 0.41927861783588793),
 (7165, 0.37590964562193513),
 (4163, 0.3750371539085892),
 (1200, 0.3643500313378625),
 (625, 0.3471916945392976),
 (9800, 0.3454567205367511),
 (1811, 0.34316927409933484),
 (4602, 0.333273371469073),
 (7520, 0.3108687870513716),
 (4223, 0.30769848581569215),
 (6156, 0.3066837363402574),
 (6774, 0.29876871350185724),
 (4668, 0.29831936769731776),
 (8360, 0.294294406436188),
 (4050, 0.28791910189182424),
 (4677, 0.2792994767879109),
 (1622, 0.27464644666814214),
 (4334, 0.26621318144516815),
 (2256, 0.2656620764131077),
 (9129, 0.259274642069934),
 (3569, 0.2585214629602737),
 (6406, 0.25798185586227673),
 (1870, 0.2561034637292875),
 (6811, 0.25580687211021147),
 (9507, 0.25099970852921316),
 (2255, 0.24837573282832864),
 (4768, 0.24613113570014297),
 (2787, 0.24506693108361918),
 (5788, 0.24485292901559053),
 (9672, 0.24454667486120926),
 (9149, 0.23913367133932192),
 (760

# RECCOMENDATION FUNCTION

In [15]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:6]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [25]:
df[df['song']=='Fast Junkie'].index[0]

15313

In [27]:
recommendation('Michael Picasso')

["Free Fallin'",
 "It's Gonna Be A Good Day",
 'True To You',
 'And Then...',
 "Leavin'"]

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('saregama.pkl','wb'))