In [87]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [48]:
df = pd.read_csv('spotify_millsongdata.csv')

In [49]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [50]:
df.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [51]:
df.shape

(57650, 4)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [53]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [54]:
df = df.sample(5000).drop(['link'],axis=1).reset_index(drop=True)

In [55]:
df.head()

Unnamed: 0,artist,song,text
0,Westlife,Somebody Needs You,I'm only human \r\nSometimes I make mistakes ...
1,Ice Cube,Check Yo Self,Yeah! Word is bond! \r\nDas EFX in this ya kn...
2,Chaka Khan,Maybe Your Baby,"I'm feeling down, in some kinda lonely \r\nCu..."
3,Ne-Yo,Congratulations,[Verse 1] \r\nOne of my best girls went and g...
4,Michael Jackson,Here I Am,I can't believe that is real \r\nThe way that...


In [56]:
df['text'][1]

'Yeah! Word is bond!  \r\nDas EFX in this ya know what I\'m sayin\'  \r\nStraight from the sewer word is bond!  \r\nYeah! Yea! Ah yeah! We doin\' this with my nigga  \r\nWhere my nigga? Ice Cube in the motherfucker  \r\nWord is bond! (Yeah!)  \r\n  \r\nYou better check yo self before you wreck yo self  \r\n\'cause I\'m bad for your health I come real stealth  \r\nDroppin\' bombs on ya moms fuck car alarms  \r\nDoin\' foul crime, I\'m that nigga wit\'cha Alpine  \r\nSold it for a six-o, always let tricks know  \r\nAnd friends know, we got the indo  \r\nNo I\'m not a sucker, sittin\' in a House of Pain  \r\nAnd no I\'m not the butler, I\'ll cut ya (Uh!)  \r\nHeadbutt ya, you say you can\'t touch this  \r\nAnd I wouldn\'t touch ya, in fact motherfuck ya  \r\nHere to let you know boy, oh boy  \r\nI make dough but don\'t call me DoughBoy  \r\nThis ain\'t no fuckin motion picture  \r\nA guy or bitch-a, I\'ll get wit\'cha  \r\nAnd hit ya, takin that yack to the neck  \r\nSo you better run a c

In [57]:
def no_of_words(text):
    words = text.split()
    word_count = len(words)
    return word_count

In [58]:
df['word count'] = df['text'].apply(no_of_words)

In [73]:
df.head()

Unnamed: 0,artist,song,text,word count
0,Westlife,Somebody Needs You,im human sometimes make mistakes forgive im go...,118
1,Ice Cube,Check Yo Self,yeah word bond das efx ya know im sayin straig...,587
2,Chaka Khan,Maybe Your Baby,im feeling kinda lonely cuz baby done left hea...,155
3,Ne-Yo,Congratulations,verse 1 one best girls went got boo sudden dam...,325
4,Michael Jackson,Here I Am,cant believe real way made feel burning deep i...,112


## Text Cleaning / Preprocessing

In [60]:
def data_processing(text):
    text = text.lower()
    text = re.sub('\r\n','',text)
    text = re.sub(r'\@\w+|\#','',text)
    text = re.sub(r'[^\w\s]','',text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [61]:
df.text = df['text'].apply(data_processing)

In [62]:
df.head()

Unnamed: 0,artist,song,text,word count
0,Westlife,Somebody Needs You,im human sometimes make mistakes forgive im go...,118
1,Ice Cube,Check Yo Self,yeah word bond das efx ya know im sayin straig...,587
2,Chaka Khan,Maybe Your Baby,im feeling kinda lonely cuz baby done left hea...,155
3,Ne-Yo,Congratulations,verse 1 one best girls went got boo sudden dam...,325
4,Michael Jackson,Here I Am,cant believe real way made feel burning deep i...,112


In [63]:
stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [64]:
df.text = df['text'].apply(stemming)

In [65]:
vect = TfidfVectorizer(analyzer='word')

In [66]:
matrix = vect.fit_transform(df['text'])

In [67]:
similar = cosine_similarity(matrix)

In [83]:
df[df['artist']=='Green Day']

Unnamed: 0,artist,song,text,word count
553,Green Day,Boulevard Of Broken Songs,walk lonely road one ever known dont know goes...,486
962,Green Day,All The Time,time every time need whats time id say time ri...,162
1755,Green Day,The Grouch,young boy big plans im another shitty old man ...,156
2014,Green Day,Only Of You,wish could tell words would come wrong oh knew...,134
2259,Green Day,Stuart And The Ave,standing corner stuart avenue ripping transfer...,137
2558,Green Day,Brain Stew,im trouble trying sleep im counting sheep runn...,135
2662,Green Day,Jackass,know hate loving must like suicide dont mind d...,177
3029,Green Day,The Eye Of The Tiger,originally survivor rising back street time to...,253
3206,Green Day,Somewhere Now,verse 1 im running late somewhere dont want fu...,202
3505,Green Day,Prosthetic Head,see front line sight sore eyes youre suicide m...,116


## Recommender Function

In [85]:
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similar[idx])), reverse=True, key= lambda x:x[1])
    song = []
    for s_id in distance[1:10]:
        song.append(df.iloc[s_id[0]].song)
    return song

In [86]:
recommender('Boulevard Of Broken Songs')

['Walk On Down',
 'Alien',
 'Walk On By',
 'Thank You For Sending Me An Angel',
 'Never Alone',
 'Walk The Dog',
 'If I Walk Away',
 'Walk Through This World With Me',
 "Don't Walk Away"]

In [88]:
pickle.dump(similar, open("similarity","wb"))

In [89]:
pickle.dump(df, open("df","wb"))