In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
df=pd.read_csv("songdata.csv")

In [28]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


# Data analysing

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [30]:
df.shape

(57650, 4)

In [31]:
df['artist'].value_counts()

artist
Donna Summer        191
Gordon Lightfoot    189
Bob Dylan           188
George Strait       188
Loretta Lynn        187
                   ... 
Zazie                 2
Zed                   1
Zoe                   1
X-Treme               1
U-Kiss                1
Name: count, Length: 643, dtype: int64

In [32]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [33]:
df.duplicated().sum()

0

# Data processing

In [36]:
df= df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [92]:
df

Unnamed: 0,artist,song,text
0,Jason Mraz,Long Drive,"long drive, long night the best night of my li..."
1,Vybz Kartel,Realest Thing,ah preston wha di mattar skin girl like ack an...
2,Glen Campbell,Freeborn Man,i wa born in southland twenti some odd year ag...
3,Utopia,Forgotten But Not Gone,she' been good to me she' been as kind as a gi...
4,Billy Joel,She's Got A Way,she' got a way about her i don't know what it ...
...,...,...,...
4995,Diana Ross,Had You Been Around,i might have known a lot of thing been smile u...
4996,Ugly Kid Joe,Clover,"vers 1 i'm, protect by the leav of the four le..."
4997,Oasis,Love Like A Bomb,"i see you, a woman, a world in my mind. girl i..."
4998,Alan Parsons Project,Fall Free,"(bairnson, elliott, parsons) what' the use of ..."


# cleaning 

In [38]:
df['text'][0]

"Long drive, long night  \nThe best night of my life  \nWith you ridin'  \nYour hand on my hand  \n  \nThe thought of arriving  \nKinda feels like dying  \nI don't want to go home  \nAnd be alone  \n  \nCould we stay out  \nCould you drive a little slower  \nDon't matter where we're going  \nAs long as I'm with you  \nWe can take the long way  \n  \nChevy Nova  \nFront seat sofa  \nGetting closer  \nTo you  \n  \nDrive a little slower  \nDon't matter where we're going  \nAs long as I'm with you  \nWe can take the long way  \n  \nDrive a little slower  \nNot ready to go home  \nI rather stay with you  \nWe can take the long way out  \n  \nTo the country, out of town  \nLet's get lost, I don't want to be found  \nLet's get away now  \n  \nAnd be careful not to crash  \nIt's known there's frost and we're steaming the glass  \nYou hit the road, have a generous shoulder  \nWe could pull over  \nAnd say we took the long way  \n  \nHeadlights  \nStrobe lights  \nI can see you  \nBut not quite

In [41]:
df['text'].str.lower().replace(r'[\w\s]','')

0       long drive, long night  \nthe best night of my...
1       ah preston wha di mattar  \nskin girls like ac...
2       i was born in southland twenty some odd years ...
3       she's been good to me  \nshe's been as kind as...
4       she's got a way about her  \ni don't know what...
                              ...                        
4995    i might have known a lot of things  \nbeen smi...
4996    verse 1  \ni'm, protected by the leaves of the...
4997    i see you, a woman, a world in my mind.  \ngir...
4998    (bairnson, elliott, parsons)  \n  \nwhat's the...
4999    (traditional)  \naway in a manger no crib for ...
Name: text, Length: 5000, dtype: object

In [48]:
df['text'] =df['text']=df['text'].replace('\\n','',regex=True)

In [49]:
df['text'][0]

"Long drive, long night  The best night of my life  With you ridin'  Your hand on my hand    The thought of arriving  Kinda feels like dying  I don't want to go home  And be alone    Could we stay out  Could you drive a little slower  Don't matter where we're going  As long as I'm with you  We can take the long way    Chevy Nova  Front seat sofa  Getting closer  To you    Drive a little slower  Don't matter where we're going  As long as I'm with you  We can take the long way    Drive a little slower  Not ready to go home  I rather stay with you  We can take the long way out    To the country, out of town  Let's get lost, I don't want to be found  Let's get away now    And be careful not to crash  It's known there's frost and we're steaming the glass  You hit the road, have a generous shoulder  We could pull over  And say we took the long way    Headlights  Strobe lights  I can see you  But not quite  I can feel you  Inside  The timing  Is just right  For the moment  I don't want  To go

In [50]:
import nltk
from nltk.stem.porter import PorterStemmer 
ps=PorterStemmer()

In [51]:
def stem(text):
    y = [ps.stem(word) for word in text.split()]
    return " ".join(y)

In [52]:
df['text']= df['text'].apply(stem)

In [53]:
df['text'][0]

"long drive, long night the best night of my life with you ridin' your hand on my hand the thought of arriv kinda feel like die i don't want to go home and be alon could we stay out could you drive a littl slower don't matter where we'r go as long as i'm with you we can take the long way chevi nova front seat sofa get closer to you drive a littl slower don't matter where we'r go as long as i'm with you we can take the long way drive a littl slower not readi to go home i rather stay with you we can take the long way out to the country, out of town let' get lost, i don't want to be found let' get away now and be care not to crash it' known there' frost and we'r steam the glass you hit the road, have a gener shoulder we could pull over and say we took the long way headlight strobe light i can see you but not quit i can feel you insid the time is just right for the moment i don't want to go home take the long way out oh, oh, oh take the long way now oh, oh, oh drive a littl slower don't ma

# Vactorisation

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english',max_features=5000)
vector = tfidf.fit_transform(df['text'])

In [76]:
vector[0]

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 54 stored elements in Compressed Sparse Row format>

In [77]:
vector.shape

(5000, 5000)

In [78]:
tfidf.get_feature_names_out() #this is used to get that 5000 words from vector

array(['000', '10', '12', ..., 'zone', 'zoo', 'zoom'], dtype=object)

In [79]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)
similarity.shape

(5000, 5000)

In [80]:
similarity[0]

array([1.        , 0.00201452, 0.04295649, ..., 0.00302954, 0.02095891,
       0.05280675])

In [81]:
 sorted(list(enumerate(similarity[0])),reverse=True,key = lambda x: x[1])[1:6]

[(482, 0.3991163065520288),
 (158, 0.3873541611818723),
 (3734, 0.34897446491076184),
 (2261, 0.34324601144947353),
 (1810, 0.33045590509529843)]

In [90]:
df['song'][0]

'Long Drive'

In [91]:
df[df['song']=='Long Drive'].index[0]

0

In [86]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [93]:
recommendation('Long Drive')

['Long Way Home',
 'How Long Will I Love You',
 'Touch And Go',
 'Drives Me Crazy',
 'Dead End',
 'A Long Way From Home',
 'Long Way Down',
 'Prisoners',
 'Time Has Come',
 'A Change Is Gonna Come',
 'Long Road',
 'Will The Future Blame Us',
 'How Long',
 'Long Gone And Moved On',
 'Warpath',
 'Just As Long As You Are There',
 'No Matter Who',
 'A Long Night',
 'Playing Canasta In Cold Rooms',
 'So Long Frank Lloyd Wright']

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df['song'],open('songslist.pkl','wb'))