Content-based music recommendation system (nearest songs by audio-feature similarity)

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
sns.set()

In [None]:
!unzip spotify_dataset.zip

Archive:  spotify_dataset.zip
  inflating: data/data.csv           
  inflating: data/data_by_artist.csv  
  inflating: data/data_by_genres.csv  
  inflating: data/data_by_year.csv   
  inflating: data/data_w_genres.csv  


In [None]:
# The archive is extracted into the data/ sub-directory (see the unzip output),
# so read the track table from there rather than from the working directory.
data = pd.read_csv("data/data.csv")
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [None]:
# Print a concise summary of `data`: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [None]:
# Count the missing values in each column of `data`.
data.isna().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

In [None]:
# Correlation matrix of the audio features. Identifier/text columns, the release
# date and the year are dropped first. fillna returns a new frame, so it is
# chained into the assignment (the original call discarded its result).
df = data.drop(columns=['id', 'name', 'artists', 'release_date', 'year']).fillna(0)
df.corr()

Unnamed: 0,valence,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,features
valence,1.0,-0.184101,0.558946,-0.191813,0.353876,-0.018613,-0.198501,0.028473,0.003832,0.313512,0.015641,0.0142,0.046381,0.171689,-0.017261
acousticness,-0.184101,1.0,-0.266852,-0.076373,-0.749393,-0.246007,0.329819,-0.02055,-0.024482,-0.561696,0.047168,-0.573162,-0.04398,-0.20712,0.083793
danceability,0.558946,-0.266852,1.0,-0.139937,0.221967,0.241757,-0.278063,0.024439,-0.100193,0.285057,-0.045956,0.199606,0.235491,0.001801,-0.027364
duration_ms,-0.191813,-0.076373,-0.139937,1.0,0.042119,-0.04888,0.08477,-0.004266,0.047168,-0.003037,-0.046085,0.059597,-0.084604,-0.025472,-0.047217
energy,0.353876,-0.749393,0.221967,0.042119,1.0,0.132723,-0.281101,0.027705,0.126192,0.782362,-0.03926,0.485005,-0.070555,0.250865,-0.113134
explicit,-0.018613,-0.246007,0.241757,-0.04888,0.132723,1.0,-0.140987,0.005432,0.03964,0.1403,-0.078872,0.191543,0.41407,0.011969,0.05321
instrumentalness,-0.198501,0.329819,-0.278063,0.08477,-0.281101,-0.140987,1.0,-0.014591,-0.047193,-0.408611,-0.036543,-0.29675,-0.1217,-0.105361,0.073809
key,0.028473,-0.02055,0.024439,-0.004266,0.027705,0.005432,-0.014591,1.0,0.000205,0.017385,-0.11626,0.007826,0.023784,0.002629,0.000227
liveness,0.003832,-0.024482,-0.100193,0.047168,0.126192,0.03964,-0.047193,0.000205,1.0,0.056422,0.002641,-0.076464,0.134667,0.007714,0.046167
loudness,0.313512,-0.561696,0.285057,-0.003037,0.782362,0.1403,-0.408611,0.017385,0.056422,1.0,-0.010727,0.457051,-0.139296,0.209774,-0.183985


In [None]:
#normalization
from sklearn.preprocessing import MinMaxScaler
datatypes = ['int16', 'int32', 'int64' , 'float16' , 'float32','float64']
# Leave out the cluster label that the k-means cell adds to `data`, so that
# re-running this cell after clustering does not feed the labels back in.
numeric = data.drop(columns=['features'], errors='ignore').select_dtypes(include=datatypes)
# Rescale every numeric column to [0, 1]. Constructing MinMaxScaler alone does
# nothing; the scaler has to be fitted and applied to the values.
normalization = pd.DataFrame(
    MinMaxScaler().fit_transform(numeric),
    columns=numeric.columns,
    index=numeric.index,
)

In [None]:
#kmeans algo
from sklearn.cluster import KMeans
# A fixed random_state makes the cluster assignment reproducible between runs;
# n_init is set explicitly because its default changed between scikit-learn releases.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
features = kmeans.fit_predict(normalization)
# Store the cluster label rescaled to [0, 1]. The scaler must be fitted and
# applied; merely constructing MinMaxScaler(...) leaves the column unchanged.
data['features'] = MinMaxScaler().fit_transform(features.reshape(-1, 1)).ravel()



In [None]:
#recommendation system

class Spotify_Recommendation():
    """Content-based recommender: ranks songs by feature distance to a query song.

    The dataset must contain a ``name`` column (used to look up the query) and
    an ``artists`` column (returned with the result). Every numeric column is
    used as a feature.
    """

    def __init__(self, dataset):
        """Keep a reference to ``dataset``; it is never modified."""
        self.dataset = dataset

    def recommend(self, songs, amount=1):
        """Return the ``amount`` songs closest to the song titled ``songs``.

        Parameters
        ----------
        songs : str
            Title of the query song, matched case-insensitively against the
            ``name`` column. If several rows share the title, the first is used.
        amount : int, default 1
            Number of recommendations to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``artists`` and ``name`` of the closest songs, nearest
            first. Rows whose title equals the query title are excluded.

        Raises
        ------
        IndexError
            If no row in the dataset has the requested title.
        """
        titles = self.dataset['name'].str.lower()
        is_query = titles == songs.lower()
        if not is_query.any():
            # Same exception type as the former out-of-bounds lookup, but with
            # a message that names the actual problem.
            raise IndexError(f"song {songs!r} not found in dataset")

        # Only genuinely numeric columns are features; text columns that merely
        # look numeric (e.g. a year-only release date) are not compared.
        numeric = self.dataset.select_dtypes(include='number').astype('float64')

        # Min-max scale each feature so that large-magnitude columns such as
        # duration_ms do not drown out the 0-1 audio features. Constant columns
        # have zero span; dividing them by 1 makes them contribute nothing.
        lowest = numeric.min()
        span = numeric.max() - lowest
        scaled = (numeric - lowest) / span.where(span > 0, 1.0)

        query = scaled.loc[is_query].iloc[0]
        candidates = scaled.loc[~is_query]

        # L1 (Manhattan) distance from the query to every candidate. With
        # skipna=False a row with a missing feature gets NaN and sorts last.
        distance = candidates.sub(query, axis='columns').abs().sum(axis='columns', skipna=False)

        # assign() builds a new frame, so no value is ever set on a slice of the
        # caller's dataset; to_numpy() keeps the assignment positional.
        rec = (
            self.dataset.loc[~is_query]
            .assign(distance=distance.to_numpy())
            .sort_values('distance', kind='stable')
        )
        return rec[['artists', 'name']].head(amount)



In [None]:
# Build the recommender over the full song table and request the ten songs
# closest to "Lovers Rock".
recommendations = Spotify_Recommendation(dataset=data)
recommendations.recommend(songs="Lovers Rock", amount=10)

100%|██████████| 170651/170651 [00:08<00:00, 20527.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec['distance'] = distance


Unnamed: 0,artists,name
35546,['Shinedown'],Save Me
108860,"['Summer Walker', 'PARTYNEXTDOOR']",My Affection (with PARTYNEXTDOOR)
16132,['O-Zone'],Dragostea Din Tei
75017,"[""Olivia O'Brien""]",Love Myself
73472,['The English Beat'],Save It For Later
168327,['Regina Spektor'],Folding Chair
90808,['Beto Y Sus Canarios'],Tuve una Novia
57298,"['Anuel AA', 'Travis Barker']",No Llores Mujer
140709,['Sam Smith'],Diamonds
53091,"['Naughty By Nature', 'Zhané']",Jamboree (feat. Zhané)
