**Import Libraries**

In [49]:
import pandas as pd
import spacy
import nltk
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

**Load Dataset**

In [3]:
df = pd.read_csv("/content/spotify_millsongdata.csv", on_bad_lines='skip')


In [4]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
df.sample(5)

Unnamed: 0,artist,song,link,text
34963,Guns N' Roses,Reckless Life,/g/guns+n+roses/reckless+life_10139506.html,I'm reckless and feelin' no pain \r\nYou know...
13532,Natalie Imbruglia,Hurricane,/n/natalie+imbruglia/hurricane_20098250.html,It came on like a hurricane \r\nAnd I don't u...
2952,Christina Perri,My Eyes,/c/christina+perri/my+eyes_20911639.html,My eyes are so tired \r\nThey should sleep \...
6610,George Harrison,Devil's Radio,/g/george+harrison/devils+radio_20059050.html,"Gossip, gossip \r\nGossip, gossip \r\n \r\n..."
10142,Katy Perry,Bad Dream,/k/katy+perry/bad+dream_20916151.html,I feel like \r\nI've been watching \r\nThe s...


In [6]:
df.shape

(57650, 4)

In [7]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [10]:
df['text'][0]

"Look at her face, it's a wonderful face  \r\nAnd it means something special to me  \r\nLook at the way that she smiles when she sees me  \r\nHow lucky can one fellow be?  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?  \r\n  \r\nAnd when we go for a walk in the park  \r\nAnd she holds me and squeezes my hand  \r\nWe'll go on walking for hours and talking  \r\nAbout all the things that we plan  \r\n  \r\nShe's just my kind of girl, she makes me feel fine  \r\nWho could ever believe that she could be mine?  \r\nShe's just my kind of girl, without her I'm blue  \r\nAnd if she ever leaves me what could I do, what could I do?\r\n\r\n"

**Text Preprocessing**

**Remove Stopwords and Punctuations**

In [12]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
# Load SpaCy model
nlp = spacy.load('en_core_web_lg')



In [15]:
def clean_text(text):
    # Process text with SpaCy
    doc = nlp(text)
    # Tokenize and remove stopwords, punctuation
    cleaned_text = ' '.join(token.text for token in doc if not token.is_stop and not token.is_punct)
    return cleaned_text.lower()


In [24]:
df_new = df.sample(5000).reset_index(drop=True)

In [25]:
df_new.shape

(5000, 4)

In [26]:
df_new.head()

Unnamed: 0,artist,song,link,text
0,Judy Garland,I'm Always Chasing Rainbows,/j/judy+garland/im+always+chasing+rainbows_202...,"At the end of the rainbow there's happiness, ..."
1,Cyndi Lauper,Unconditional Love,/c/cyndi+lauper/unconditional+love_20270345.html,And I want to be everything \r\nYou want me t...
2,Nickelback,Little Friend,/n/nickelback/little+friend_20100412.html,Manhunt to find your daughter \r\nBuried unde...
3,Chris Brown,Favorite Girl,/c/chris+brown/favorite+girl_21065811.html,"Up down, up down won't you give me a plenty gi..."
4,Alice Cooper,Never Been Sold Before,/a/alice+cooper/never+been+sold+before_2028674...,"You ask me, baby \r\nCan you work tonight \r..."


In [27]:
df_new['text'][0]

"At the end of the rainbow there's happiness,  \r\nAnd to find it how often I've tried,  \r\nBut my life is a race, just a wild goose chase,  \r\nAnd my dreams have all been denied.  \r\nWhy have I always been a failure?  \r\nWhat can the reason be?  \r\nI wonder if the world's to blame,  \r\nI wonder if it could be me.  \r\nChorus:  \r\nI'm always chasing rainbows,  \r\nWatching clouds drifting by,  \r\nMy dreams are just like all my schemes,  \r\nEnding in the sky.  \r\nSome fellows look and find the sunshine,  \r\nI always look and find the rain.  \r\nSome fellows make a winning sometime,  \r\nI never even make a gain, believe me,  \r\nI'm always chasing rainbows,  \r\nI'm watching for a little bluebird in vain.\r\n\r\n"

In [28]:
df_new['text'] = df_new['text'].apply(clean_text)

In [29]:
df_new['text'][0]

'end rainbow happiness  \r\n find tried  \r\n life race wild goose chase  \r\n dreams denied  \r\n failure  \r\n reason  \r\n wonder world blame  \r\n wonder  \r\n chorus  \r\n chasing rainbows  \r\n watching clouds drifting  \r\n dreams like schemes  \r\n ending sky  \r\n fellows look find sunshine  \r\n look find rain  \r\n fellows winning  \r\n gain believe  \r\n chasing rainbows  \r\n watching little bluebird vain \r\n\r\n'

In [34]:
stemmer = PorterStemmer()
nltk.download('punkt')
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [35]:
tokenization('you are beautiful beauti beaty')

['you', 'are', 'beautiful', 'beauti', 'beaty']


'you are beauti beauti beati'

In [36]:
df_new['text'] = df_new['text'].apply(tokenization)

['end', 'rainbow', 'happiness', 'find', 'tried', 'life', 'race', 'wild', 'goose', 'chase', 'dreams', 'denied', 'failure', 'reason', 'wonder', 'world', 'blame', 'wonder', 'chorus', 'chasing', 'rainbows', 'watching', 'clouds', 'drifting', 'dreams', 'like', 'schemes', 'ending', 'sky', 'fellows', 'look', 'find', 'sunshine', 'look', 'find', 'rain', 'fellows', 'winning', 'gain', 'believe', 'chasing', 'rainbows', 'watching', 'little', 'bluebird', 'vain']
['want', 'want', 'sure', 'know', 'lose', 'faith', 'lose', 'ground', 'remember', 'remember', 'unconditional', 'love', 'love', 'love', 'unconditional', 'love', 'matter', "'cause", 'taking', 'wrong', 'way', 'read', 'mind', 'fight', 'time', 'surrender', 'unconditional', 'love', 'love', 'love', 'unconditional', 'love', 'love', 'standing', 'wilder', 'shore', 'got', 'head', 'clouds', 'oh', 'ai', 'got', 'sense', 'direction', 'want', 'lie', 'want', 'lie', 'want', 'want', 'want', 'want', 'wish', 'sky', 'sure', 'know', 'imagine', 'clouds', 'look', 'like

In [40]:
df_new.tail(3)

Unnamed: 0,artist,song,link,text
4997,Rush,The Present Tense,/r/rush/the+present+tense_20709852.html,live present tens lose past futur make sens li...
4998,Lucky Dube,False Prophets,/l/lucky+dube/false+prophets_20196160.html,everyday movin hold bibl hand yeah go church s...
4999,Green Day,Brown Eyed Girl,/g/green+day/brown+eyed+girl_20631105.html,hey day rain came hollow playin new game laugh...


In [42]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df_new['text'])
similarity = cosine_similarity(matrix)

In [43]:
similarity[0]

array([1.        , 0.01763505, 0.03244854, ..., 0.02747999, 0.01481583,
       0.01053076])

In [46]:
def recommendation(song_df):
    idx = df_new[df_new['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])

    songs = []
    for m_id in distances[1:10]:
        songs.append(df_new.iloc[m_id[0]].song)

    return songs

In [48]:
recommendation('Favorite Girl')

['Dedicated To My Favourite Girl',
 "I'll Never Find Someone Like You",
 'Same Girl',
 'My Girl',
 'Another Girl',
 '2nd Round',
 'Only Girl In The World',
 'Fly Girl',
 'His Mistakes']

In [50]:
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df_new,open('df.pkl','wb'))