In [87]:
import pandas as pd

In [88]:
df = pd.read_csv("./dataset/spotify_millsongdata.csv")

In [89]:
df.head(10)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong \r\nYou're e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,I was out with the morning sun \r\nCouldn't s...
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby \r\nI'm sitting all ...


In [90]:
df.isnull().sum() #checking for missing data

artist    0
song      0
link      0
text      0
dtype: int64

In [91]:
df = df.sample(5000).drop("link",axis=1).reset_index(drop=True)

In [92]:
df.head(15)

Unnamed: 0,artist,song,text
0,Imperials,Water Grave,In my house there's been a mercy killing. \r\...
1,Horrible Histories,Transportation,[Intro] \r\nWith imagination \r\nInspiration...
2,Nickelback,Next Contestant,I judge by what she's wearing \r\nJust how ma...
3,Willie Nelson,Honky Tonk Women,Well I met a California girl in Memphis \r\nA...
4,Dean Martin,I Will,I don't want to be the one to say I'm gonna mi...
5,Engelbert Humperdinck,You Are The Sunshine Of My Life,You are the sunshine of my life \r\nThat's wh...
6,Frank Sinatra,Azure-Te (Paris Blue),"Gone and got the blues in Paris, \r\nParis bl..."
7,Isley Brothers,It's A Disco Night (Rock Don't Stop),"Baby, the place is rockin' \r\nIt's a disco n..."
8,Vince Gill,Cowboy Up,I'm puttin' on my favorite cowboy boots \r\nL...
9,Engelbert Humperdinck,Blue Bayou,"I feel so bad, I got a worried mind \r\nI'm s..."


In [93]:
df['text'][3]

"Well I met a California girl in Memphis  \r\nAnd she tried to take me upstairs for a ride  \r\nAnd she had to hold me right across her shoulder  \r\nAnd I just can't seem to get her off of my mind  \r\nThese honky tonk women gonna give me give give me the honky tonk blues  \r\nShe's a honky tonk woman give me give give me the honky tonk blues  \r\n  \r\nWell I later did the same in New York City  \r\nAnd I had to pull out of some kind of a fight  \r\nAnd the lady she covered me with roses  \r\nShe blew my nose and then she blew my mind  \r\nHonky tonk women  \r\nAnd she had to hold me  \r\nShe's a honky tonk woman  \r\nHonky tonk women  \r\nShe's a honky tonk woman\r\n\r\n"

## Cleaning the Text data 

In [94]:
df['text'] = df['text'].str.lower().replace(r'^\W\S','').replace(r'\n',' ', regex=True)

### Tokenizing the text words and then vectorizing the words to vector and using cosine similarity distance between vectors we are recommending the songs

In [95]:
import nltk
from nltk.stem.porter import PorterStemmer


from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


nltk.download('punkt_tab', download_dir='./nltk_data')
nltk.data.path.append('./nltk_data')
#Using Stemming to tokenize the words

[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [96]:
stemmer = PorterStemmer()

In [97]:
def token(txt):
    token = nltk.word_tokenize(txt)
    stemed =  [stemmer.stem(w) for w in token]
    
    return " ".join(stemed)

In [98]:
token("i am listening to music")

'i am listen to music'

In [99]:
df['text'].apply(lambda x: token(x))

0       in my hous there 's been a merci kill . the ma...
1       [ intro ] with imagin inspir innov and perspir...
2       i judg by what she 's wear just how mani head ...
3       well i met a california girl in memphi and she...
4       i do n't want to be the one to say i 'm gon na...
                              ...                        
4995    at first flash of eden , we race down to the s...
4996    oh , you look so tire mouth slack and wide ill...
4997    from the backdoor of your life you swept me ou...
4998    do n't kiss my lip and say good-by , it make m...
4999    ( speech : ) ok here we go , for thi we got a ...
Name: text, Length: 5000, dtype: object

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#for vectorization - TF_IDF

In [101]:
tfidf = TfidfVectorizer(analyzer='word',stop_words='english')

In [102]:
matrix = tfidf.fit_transform(df['text'])

In [103]:
similarity = cosine_similarity(matrix)

In [104]:
similarity[4999]

array([0.02148391, 0.00403812, 0.02869569, ..., 0.02013428, 0.02487519,
       1.        ])

In [105]:
df.head(10)

Unnamed: 0,artist,song,text
0,Imperials,Water Grave,in my house there's been a mercy killing. \r ...
1,Horrible Histories,Transportation,[intro] \r with imagination \r inspiration ...
2,Nickelback,Next Contestant,i judge by what she's wearing \r just how man...
3,Willie Nelson,Honky Tonk Women,well i met a california girl in memphis \r an...
4,Dean Martin,I Will,i don't want to be the one to say i'm gonna mi...
5,Engelbert Humperdinck,You Are The Sunshine Of My Life,you are the sunshine of my life \r that's why...
6,Frank Sinatra,Azure-Te (Paris Blue),"gone and got the blues in paris, \r paris blu..."
7,Isley Brothers,It's A Disco Night (Rock Don't Stop),"baby, the place is rockin' \r it's a disco ni..."
8,Vince Gill,Cowboy Up,i'm puttin' on my favorite cowboy boots \r li...
9,Engelbert Humperdinck,Blue Bayou,"i feel so bad, i got a worried mind \r i'm so..."


# Recommendation funcion 


In [None]:
def recommender(song_name):
    idx = df[df['song'] == song_name].index[0]
    distance = sorted(list(enumerate(similarity[idx])), reverse=True, key =lambda x: x[1])
    song = []
    for sid in distance[1:5]:
        song.append(df.iloc[sid[0]].song)

    return song

In [109]:
recommender("Transportation")

['Hold On', 'Dance', 'These Days', "I Don't Want To Know If You're Lonely"]

## dumping in a file

In [110]:
import pickle

In [111]:
pickle.dump(similarity, open("similarity","wb"))

In [112]:
pickle.dump(df, open("dataframe","wb"))