In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import regex as re
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# loading the files as csv
data = pd.read_csv(r'/content/spotify_millsongdata.csv')

In [None]:
# See the top 5 data
data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [None]:
# See the data last 5
data.tail()

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


In [None]:
# data information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [None]:
# Checking the null values
data.isna().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [None]:
data.shape

(57650, 4)

In [None]:
# Drop the "link" column because it is not needed for the recommender.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: the in-place drop raises KeyError the second time.
data = data.drop(columns="link", errors="ignore")

In [None]:
# Reset the index to a clean 0..n-1 range.
# reset_index returns a new frame, so the result has to be assigned back;
# the bare call only displayed a copy and left `data` unchanged.
data = data.reset_index(drop=True)
data

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...
...,...,...,...
57645,Ziggy Marley,Good Old Days,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,northern star \r\nam i frightened \r\nwhere ...


In [None]:
# Text Preprocessing to clear
def text_clean(text):
    """Normalise a lyric string for vectorization.

    Lower-cases the text, removes every character that is not a lowercase
    ASCII letter or whitespace, collapses each whitespace run into a single
    space and trims the ends.

    Parameters
    ----------
    text : str
        Raw lyric text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    # Keep only a-z and whitespace (digits, punctuation, accents are dropped).
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # Newlines, tabs and repeated spaces all become one space.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()


In [None]:
# Apply text_clean to every lyric in the "text" column
data["text"]=data["text"].apply(text_clean)

In [None]:
# Now you can see the data is clean in text columns
data["text"]

Unnamed: 0,text
0,look at her face its a wonderful face and it m...
1,take it easy with me please touch me gently li...
2,ill never know why i had to go why i had to pu...
3,making somebody happy is a question of give an...
4,making somebody happy is a question of give an...
...,...
57645,irie days come on play let the angels fly let ...
57646,power to the workers more power power to the w...
57647,all you need is something ill believe flashlig...
57648,northern star am i frightened where can i go t...


### The cleaning step is done. Computers can't work with raw text directly, so next we convert the lyrics into numeric vectors.

In [None]:
# Create the Porter stemmer used to reduce each token to its stem before vectorizing
stemmer = PorterStemmer()

In [None]:
# Word tokenization , splitting the words
def tokens(text):
    """Tokenize ``text`` into words, stem each word, and re-join with spaces.

    Uses ``nltk.word_tokenize`` for splitting and the module-level ``stemmer``
    for stemming.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    stemmed_words = map(stemmer.stem, nltk.word_tokenize(text))
    return " ".join(stemmed_words)

In [None]:
# Tokenize and stem every lyric, storing the result back in the "text" column.
# The bare .apply(...) call only displayed the stemmed text and then discarded
# it, so the TF-IDF vectorizer further down was fitted on unstemmed lyrics.
data["text"] = data["text"].apply(tokens)
data["text"]

Unnamed: 0,text
0,look at her face it a wonder face and it mean ...
1,take it easi with me pleas touch me gentli lik...
2,ill never know whi i had to go whi i had to pu...
3,make somebodi happi is a question of give and ...
4,make somebodi happi is a question of give and ...
...,...
57645,iri day come on play let the angel fli let the...
57646,power to the worker more power power to the wo...
57647,all you need is someth ill believ flashlight i...
57648,northern star am i frighten where can i go to ...


In [None]:
# Work on a 5000-song random subset (presumably to keep the pairwise
# similarity matrix manageable).
# random_state makes the subset reproducible across runs, and reset_index makes
# each row label equal to its position, which is how rows of the
# cosine-similarity matrix built from this frame are addressed.
df = data.sample(5000, random_state=42).reset_index(drop=True)

In [None]:
# Configure the TF-IDF vectorizer: drop English stop words, tokenize by word, and keep at most 5000 features

tfid = TfidfVectorizer(stop_words="english", analyzer="word",max_features=5000)

In [None]:
# Fitting the model and transforming
matrix=tfid.fit_transform(df["text"])

In [None]:
# Cosine
cosine = cosine_similarity(matrix)

In [None]:
cosine

array([[1.00000000e+00, 1.14163552e-02, 6.84513173e-02, ...,
        9.41648841e-04, 3.14151722e-02, 3.31808541e-02],
       [1.14163552e-02, 1.00000000e+00, 2.11632433e-02, ...,
        2.24297037e-03, 1.28603294e-02, 8.91475056e-03],
       [6.84513173e-02, 2.11632433e-02, 1.00000000e+00, ...,
        1.43489404e-02, 7.65746687e-02, 1.82584447e-02],
       ...,
       [9.41648841e-04, 2.24297037e-03, 1.43489404e-02, ...,
        1.00000000e+00, 2.38618085e-03, 3.33971811e-03],
       [3.14151722e-02, 1.28603294e-02, 7.65746687e-02, ...,
        2.38618085e-03, 1.00000000e+00, 2.31871191e-02],
       [3.31808541e-02, 8.91475056e-03, 1.82584447e-02, ...,
        3.33971811e-03, 2.31871191e-02, 1.00000000e+00]])

In [None]:
# Preview the sampled frame (`df.samp` was a truncated attribute name and
# raises AttributeError on a fresh run).
df.head()

Unnamed: 0,artist,song,text
10434,Kenny Rogers,Bed Of Rose,youre that one special woman i thought id neve...
24124,Andy Williams,Lonely Street,wheres this place called lonely streeeeeeeet i...
11151,Lana Del Rey,Flipside,are you gonna hurt me now are you gonna hurt m...
14179,Nine Inch Nails,Every Day Is Exactly The Same,i believe i can see the future cause i repeat ...
38942,Judas Priest,Lochness,grey mist drifts upon the water the mirrored s...
...,...,...,...
27697,Christina Aguilera,Genie 2.0,i feel like ive been locked up tight a century...
20772,Veruca Salt,Best You Can Get,shes all crazy and shit she shows us her tits ...
42828,Marianne Faithfull,Sign Of Judgement,yes sign of judgement yes sign of judgement ye...
28077,Cliff Richard,Great Balls Of Fire,you shake my nerves and you rattle my brain to...


In [None]:
df[df["song"]=="Bed Of Rose"]

Unnamed: 0,artist,song,text
10434,Kenny Rogers,Bed Of Rose,youre that one special woman i thought id neve...


In [None]:
df[df["song"]=="Bed Of Rose"].index[0]


10434

In [None]:
# Making the recommendation model

def recommendation(song_name, top_n=5):
    """Return the titles of the songs most similar to ``song_name``.

    Similarity is read from the module-level ``cosine`` matrix, whose rows and
    columns follow the row order of the module-level ``df``.

    Parameters
    ----------
    song_name : str
        Exact value of the ``song`` column to look up. If several rows share
        the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` song titles, most similar first.

    Raises
    ------
    IndexError
        If ``song_name`` is not present in ``df["song"]``.
    """
    is_match = (df["song"] == song_name).to_numpy()
    if not is_match.any():
        raise IndexError(f"song {song_name!r} not found in df")

    # Use the row *position*, not the index label: `cosine` is positional,
    # while `df` may carry arbitrary labels (e.g. after DataFrame.sample).
    # argmax on a boolean array gives the position of the first True.
    pos = int(is_match.argmax())

    ranked = sorted(enumerate(cosine[pos]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query song explicitly instead of assuming it sorts first;
    # an exact-duplicate lyric can tie at similarity 1.0 and come earlier.
    picks = [i for i, _ in ranked if i != pos][:top_n]
    return [df.iloc[i].song for i in picks]


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): this import only exists inside Google Colab, and the export
# cell writes its pickles to relative paths rather than under /content/drive,
# so confirm whether this mount is actually needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Exporting the file
import pickle

# Persist the similarity matrix and the sampled frame for later use.
# The context managers make sure each file is flushed and closed once the dump
# finishes; the bare open(...) calls left both handles open.
with open("similarity.pkl", "wb") as similarity_file:
    pickle.dump(cosine, similarity_file)
with open("df.pkl", "wb") as df_file:
    pickle.dump(df, df_file)