## Importing libraries

In [1]:
import json
import pandas as pd
import spotipy
import config
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Sanity check: confirm the local config module imported and list the names it exposes.
dir(config)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'client_id',
 'client_secret']

## Activating the Spotify API

In [3]:
# Create the Spotipy client, authenticating with the client id / secret stored in config.py
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=config.client_id,
        client_secret=config.client_secret,
    )
)



# The "sp" client has TWO USEFUL METHODS:
# The FIRST useful method is:
# .search(q='', limit=n)
# .search(q="track:"+song_name+" artist:"+artist_name, limit=5) to restrict to a song name and artist.
# where the "q" keyword is the query you want to run on Spotify (song name, artist, ...)
# and the "limit" keyword caps the number of returned results.
#
# The SECOND useful method is:
# .audio_features([URL|URI|ID])
# which returns some 'features of the song' that, after cleanup, we can use to characterize a song.


## Defining a function for analyzing playlists

In [5]:
# Audio-feature fields copied from each audio-features record, in the order
# they appear as DataFrame columns (after names / uri / artists / popularity).
_AUDIO_FEATURE_KEYS = (
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_ms",
    "time_signature",
)

# Number of tracks sent per audio_features request (the endpoint accepts at most 100).
_AUDIO_FEATURES_BATCH_SIZE = 100


def playlist_analyzer(user, playlist_id, client=None):
    """Download a playlist and return one row per track with its audio features.

    Parameters
    ----------
    user : str
        Spotify user name, passed through to ``user_playlist_tracks``.
    playlist_id : str
        Playlist ID / URI, passed through to ``user_playlist_tracks``.
    client : optional
        Object exposing ``user_playlist_tracks``, ``next`` and
        ``audio_features`` (e.g. a ``spotipy.Spotify`` instance). Defaults to
        the notebook-level ``sp`` client.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``names``, ``uri``, ``artists`` (list of artist
        names), ``popularity``, followed by the 13 audio-feature columns
        ``danceability`` ... ``time_signature``. An empty playlist yields an
        empty frame with the same columns.

    Notes
    -----
    Playlist entries without a track payload, and tracks for which no audio
    features are returned, are skipped so that every row has a complete
    feature vector.
    """
    if client is None:
        # The original body referenced an undefined name `spotify`; the client
        # created in the API-activation cell is called `sp`.
        client = sp

    # Collect every page of playlist items (copy so the first page's list is not mutated).
    page = client.user_playlist_tracks(user=user, playlist_id=playlist_id)
    items = list(page["items"])
    while page["next"]:
        page = client.next(page)
        items.extend(page["items"])

    # Drop entries whose "track" is missing/None.
    tracks = [item["track"] for item in items if item.get("track")]

    records = []
    # Request audio features in batches instead of one API call per track.
    for start in range(0, len(tracks), _AUDIO_FEATURES_BATCH_SIZE):
        batch = tracks[start:start + _AUDIO_FEATURES_BATCH_SIZE]
        batch_features = client.audio_features([track["uri"] for track in batch]) or []
        for track, features in zip(batch, batch_features):
            if not features:
                # No audio features available for this track.
                continue
            record = {
                "names": track["name"],
                "uri": track["uri"],
                "artists": [artist["name"] for artist in track["artists"]],
                "popularity": track["popularity"],
            }
            record.update((feature, features[feature]) for feature in _AUDIO_FEATURE_KEYS)
            records.append(record)

    # Explicit column list keeps the column order stable, even for an empty result.
    columns = ["names", "uri", "artists", "popularity", *_AUDIO_FEATURE_KEYS]
    return pd.DataFrame(records, columns=columns)

## Data Exploration

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
from matplotlib import pyplot as plt
%matplotlib inline

# Seaborn theme for the plots in this notebook, with every option spelled out explicitly.
sns.set(
    context="notebook",
    style="darkgrid",
    palette="deep",
    font="sans-serif",
    font_scale=1,
    color_codes=False,
    rc=None,
)

In [6]:
# Download the playlist: one row per track, metadata plus audio features.
playlist_df = playlist_analyzer("spotify", "spotify:playlist:6FKDzNYZ8IW1pvYVF4zUN2")

# Independent copy of just the audio-feature columns (danceability through time_signature).
features_playlist = playlist_df.loc[:, "danceability":"time_signature"].copy()

In [25]:
# Preview the playlist frame.
# NOTE(review): the recorded output already contains a `cluster` column, which is
# only assigned in the KMeans cells further down — this cell appears to have been
# run after clustering; confirm with Restart Kernel -> Run All.
playlist_df

Unnamed: 0,names,uri,artists,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster
0,Starships,spotify:track:2EBCVPNAG46nbgs6jXPGvv,[Nicki Minaj],74,0.747,0.716,11,-2.457,0,0.0750,0.135000,0.000000,0.2510,0.7510,125.008,210627,4,3
1,Wild Strawberries,spotify:track:6pGUGTIaZ1H4jKHIL4Fged,[PNAU],0,0.647,0.933,7,-4.056,1,0.1110,0.000351,0.002770,0.3340,0.3320,119.921,235107,4,2
2,Papaoutai,spotify:track:09TcIuH1ZO7i4vicWKoaN2,[Stromae],0,0.733,0.818,10,-7.222,0,0.0859,0.024100,0.000000,0.0636,0.2530,116.019,232147,4,3
3,Sweet Dreams (Are Made of This) - Remastered,spotify:track:1TfqLAPs4K3s2rJMoCokcS,"[Eurythmics, Annie Lennox, Dave Stewart]",80,0.692,0.711,0,-7.498,0,0.0317,0.225000,0.000000,0.1200,0.8750,125.135,216933,4,3
4,Rock and Roll - 1990 Remaster,spotify:track:3w2GGz0HjIu9OcWXINRFJR,[Led Zeppelin],17,0.327,0.895,9,-7.428,1,0.0367,0.000564,0.015900,0.1040,0.8980,169.390,219800,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Holdin' On - Skrillex & Nero Remix,spotify:track:6M9L1sPBHbVMpEYV8hYiGU,"[I See MONSTAS, Nero, Skrillex]",54,0.474,0.951,5,-3.000,1,0.0701,0.000303,0.570000,0.6720,0.0775,174.119,237381,4,0
9996,Alberto Balsalm,spotify:track:21Phj46KeUHOWyZW9A9b7P,[Aphex Twin],57,0.792,0.599,3,-13.647,1,0.1410,0.316000,0.931000,0.1110,0.5700,93.994,310747,4,7
9997,Lgbt,spotify:track:0lsw4q8Jei7gEoV7kFe3DS,[cupcakKe],47,0.795,0.857,0,-8.676,0,0.1620,0.172000,0.000018,0.1010,0.4190,124.967,161332,4,3
9998,フライト 日 '89 (FRIDAY),spotify:track:2Oql5y6yNB0XUb8OWbi7pq,[EVADE FROM 宇宙],57,0.501,0.842,5,-6.774,0,0.2800,0.052000,0.006270,0.2460,0.6850,127.840,135058,4,6


In [20]:
# (rows, columns) of the playlist frame
playlist_df.shape

(10000, 18)

In [21]:
# Column labels of the playlist frame
playlist_df.columns

Index(['names', 'uri', 'artists', 'popularity', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'cluster'],
      dtype='object')

In [28]:
# Preview the audio-feature subset sliced out of playlist_df
features_playlist

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.747,0.716,11,-2.457,0,0.0750,0.135000,0.000000,0.2510,0.7510,125.008,210627,4
1,0.647,0.933,7,-4.056,1,0.1110,0.000351,0.002770,0.3340,0.3320,119.921,235107,4
2,0.733,0.818,10,-7.222,0,0.0859,0.024100,0.000000,0.0636,0.2530,116.019,232147,4
3,0.692,0.711,0,-7.498,0,0.0317,0.225000,0.000000,0.1200,0.8750,125.135,216933,4
4,0.327,0.895,9,-7.428,1,0.0367,0.000564,0.015900,0.1040,0.8980,169.390,219800,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.474,0.951,5,-3.000,1,0.0701,0.000303,0.570000,0.6720,0.0775,174.119,237381,4
9996,0.792,0.599,3,-13.647,1,0.1410,0.316000,0.931000,0.1110,0.5700,93.994,310747,4
9997,0.795,0.857,0,-8.676,0,0.1620,0.172000,0.000018,0.1010,0.4190,124.967,161332,4
9998,0.501,0.842,5,-6.774,0,0.2800,0.052000,0.006270,0.2460,0.6850,127.840,135058,4


In [18]:
# Per-column flag: True if the column contains at least one missing value
playlist_df.isna().any()

names               False
uri                 False
artists             False
popularity          False
danceability        False
energy              False
key                 False
loudness            False
mode                False
speechiness         False
acousticness        False
instrumentalness    False
liveness            False
valence             False
tempo               False
duration_ms         False
time_signature      False
cluster             False
dtype: bool

In [22]:
# Dtype and non-null count for every column
playlist_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   names             10000 non-null  object 
 1   uri               10000 non-null  object 
 2   artists           10000 non-null  object 
 3   popularity        10000 non-null  int64  
 4   danceability      10000 non-null  float64
 5   energy            10000 non-null  float64
 6   key               10000 non-null  int64  
 7   loudness          10000 non-null  float64
 8   mode              10000 non-null  int64  
 9   speechiness       10000 non-null  float64
 10  acousticness      10000 non-null  float64
 11  instrumentalness  10000 non-null  float64
 12  liveness          10000 non-null  float64
 13  valence           10000 non-null  float64
 14  tempo             10000 non-null  float64
 15  duration_ms       10000 non-null  int64  
 16  time_signature    10000 non-null  int64  

## Transforming the data and creating clusters for the playlist dataframe

### StandardScaler transformation

In [9]:
# Standardise the audio features: fit the scaler and transform in a single step
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_playlist)

# Wrap the scaled array in a DataFrame for inspection (columns are positional: 0, 1, ...)
features_scaled_df = pd.DataFrame(data=features_scaled)

In [10]:
# First five rows of the scaled features
features_scaled_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.921978,0.028501,1.57922,1.266349,-1.280592,-0.118367,-0.155325,-0.409122,0.347344,0.989897,0.061512,-0.344221,0.129453
1,0.289479,1.099988,0.470957,0.788146,0.780889,0.304795,-0.695565,-0.397106,0.863257,-0.745571,-0.126546,0.149384,0.129453
2,0.833428,0.532149,1.302154,-0.15869,-1.280592,0.009757,-0.600279,-0.409122,-0.817502,-1.072783,-0.270796,0.0897,0.129453
3,0.574104,0.003812,-1.468505,-0.241232,-1.280592,-0.627337,0.205774,-0.409122,-0.466929,1.503496,0.066206,-0.217069,0.129453
4,-1.734516,0.912354,1.025089,-0.220298,0.780889,-0.568565,-0.69471,-0.340151,-0.566382,1.59876,1.702235,-0.15926,0.129453


## KMeans clustering

In [29]:
# First KMeans run: 100 clusters on the scaled features, seeded for reproducibility
kmeans = KMeans(
    n_clusters=100,
    random_state=1234,
)
kmeans.fit(features_scaled)

KMeans(n_clusters=100, random_state=1234)

In [30]:
# Label every track with its closest cluster from the 100-cluster model
clusters = kmeans.predict(features_scaled)
# Store the labels next to the track metadata
playlist_df["cluster"] = clusters

In [31]:
# Number of tracks per cluster, ordered by cluster label
(
    pd.Series(clusters)
    .value_counts()
    .sort_index()
)

0      67
1     150
2     197
3      95
4      68
     ... 
95     37
96    110
97    101
98    146
99    114
Length: 100, dtype: int64

In [33]:
# Attach the computed cluster labels to the full dataset as a new column, then preview it
playlist_df["cluster"] = clusters
playlist_df.head(5)

Unnamed: 0,names,uri,artists,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster
0,Starships,spotify:track:2EBCVPNAG46nbgs6jXPGvv,[Nicki Minaj],74,0.747,0.716,11,-2.457,0,0.075,0.135,0.0,0.251,0.751,125.008,210627,4,24
1,Wild Strawberries,spotify:track:6pGUGTIaZ1H4jKHIL4Fged,[PNAU],0,0.647,0.933,7,-4.056,1,0.111,0.000351,0.00277,0.334,0.332,119.921,235107,4,7
2,Papaoutai,spotify:track:09TcIuH1ZO7i4vicWKoaN2,[Stromae],0,0.733,0.818,10,-7.222,0,0.0859,0.0241,0.0,0.0636,0.253,116.019,232147,4,35
3,Sweet Dreams (Are Made of This) - Remastered,spotify:track:1TfqLAPs4K3s2rJMoCokcS,"[Eurythmics, Annie Lennox, Dave Stewart]",80,0.692,0.711,0,-7.498,0,0.0317,0.225,0.0,0.12,0.875,125.135,216933,4,47
4,Rock and Roll - 1990 Remaster,spotify:track:3w2GGz0HjIu9OcWXINRFJR,[Led Zeppelin],17,0.327,0.895,9,-7.428,1,0.0367,0.000564,0.0159,0.104,0.898,169.39,219800,4,14


In [35]:
# All tracks whose cluster label is 7
playlist_df.loc[playlist_df["cluster"] == 7]

Unnamed: 0,names,uri,artists,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster
1,Wild Strawberries,spotify:track:6pGUGTIaZ1H4jKHIL4Fged,[PNAU],0,0.647,0.933,7,-4.056,1,0.1110,0.000351,0.002770,0.334,0.332,119.921,235107,4,7
36,She Wolf (Falling to Pieces) [feat. Sia],spotify:track:1SgdUjvppHnIp6L7DZSnwc,"[David Guetta, Sia]",66,0.492,0.857,7,-2.634,1,0.0655,0.084100,0.000008,0.344,0.393,129.973,222500,4,7
91,More - RedOne Jimmy Joker Remix,spotify:track:0aBKFfdyOD1Ttvgv0cfjjJ,[Usher],66,0.551,0.893,7,-2.628,1,0.0543,0.001660,0.000000,0.348,0.794,125.083,219987,4,7
257,Fiesta - Michael Mind Project Radio Edit,spotify:track:0vgWM2lDIYxbEpr8jklmph,"[Carlprit, Michael Mind Project]",46,0.597,0.922,6,-5.690,1,0.1490,0.017900,0.000000,0.364,0.726,127.844,234429,4,7
310,Booyah - Radio Edit,spotify:track:0NKH1bGsvz2g0VWJDGXnUd,"[Showtek, We Are Loud, Sonny Wilson]",30,0.559,0.916,11,-3.068,1,0.0626,0.045300,0.000013,0.224,0.195,128.012,215295,4,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9695,HOT DRUM,spotify:track:0gcPFlSLjkVqz84RcDhyJk,[JOYRYDE],44,0.625,0.981,7,-3.799,1,0.0597,0.000900,0.329000,0.357,0.156,125.008,266880,4,7
9716,Give It All,spotify:track:2ZawE8wifPNPb1BoWz9Qin,"[Electrick Village, Rick Derra, Justin Levai]",25,0.655,0.955,7,-3.573,1,0.0786,0.002290,0.404000,0.349,0.481,127.973,158041,4,7
9729,Come on Over,spotify:track:7qRmdDl9dnHkFT4MMboG7Y,[Royal Blood],53,0.505,0.906,7,-5.127,1,0.0399,0.000333,0.000127,0.290,0.553,107.855,173827,4,7
9867,Beastmode - GPF's Greaze Mode Remix,spotify:track:3YY2RDy3pCdpXHIb6FmlEb,"[Killshot, GPF]",42,0.557,0.971,7,-2.236,1,0.0863,0.000206,0.000973,0.320,0.226,109.970,174568,4,7


In [36]:
# Inertia of the 100-cluster fit: sum of squared distances from each sample to its
# closest cluster centre (lower means tighter clusters; loosely comparable in spirit
# to a squared-error measure in regression).
kmeans.inertia_

40355.166364508914

In [37]:
# Second KMeans run on the same scaled features, this time with 1000 clusters
# and explicitly chosen solver settings.
kmeans2 = KMeans(
    n_clusters=1000,
    init="k-means++",
    n_init=100,        # number of initialisations; try with 1, 4, 8, 20, 30, 100...
    max_iter=300,      # max number of iterations
    tol=0,
    algorithm="elkan",
    random_state=1234,
).fit(features_scaled)
print(kmeans2.inertia_)

17889.51960855788


In [38]:
# Re-label every track with the 1000-cluster model; this overwrites both
# `clusters` and the existing "cluster" column.
clusters = kmeans2.predict(features_scaled)
playlist_df["cluster"] = clusters

In [41]:
# Number of tracks per cluster for the 1000-cluster model, ordered by cluster label
(
    pd.Series(clusters)
    .value_counts()
    .sort_index()
)

0       7
1      20
2       4
3      10
4      26
       ..
995    11
996     8
997    18
998     7
999     1
Length: 1000, dtype: int64

In [44]:
# Final playlist frame: the "cluster" column now holds the labels from the
# 1000-cluster model (kmeans2).
playlist_df

Unnamed: 0,names,uri,artists,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,cluster
0,Starships,spotify:track:2EBCVPNAG46nbgs6jXPGvv,[Nicki Minaj],74,0.747,0.716,11,-2.457,0,0.0750,0.135000,0.000000,0.2510,0.7510,125.008,210627,4,857
1,Wild Strawberries,spotify:track:6pGUGTIaZ1H4jKHIL4Fged,[PNAU],0,0.647,0.933,7,-4.056,1,0.1110,0.000351,0.002770,0.3340,0.3320,119.921,235107,4,873
2,Papaoutai,spotify:track:09TcIuH1ZO7i4vicWKoaN2,[Stromae],0,0.733,0.818,10,-7.222,0,0.0859,0.024100,0.000000,0.0636,0.2530,116.019,232147,4,185
3,Sweet Dreams (Are Made of This) - Remastered,spotify:track:1TfqLAPs4K3s2rJMoCokcS,"[Eurythmics, Annie Lennox, Dave Stewart]",80,0.692,0.711,0,-7.498,0,0.0317,0.225000,0.000000,0.1200,0.8750,125.135,216933,4,928
4,Rock and Roll - 1990 Remaster,spotify:track:3w2GGz0HjIu9OcWXINRFJR,[Led Zeppelin],17,0.327,0.895,9,-7.428,1,0.0367,0.000564,0.015900,0.1040,0.8980,169.390,219800,4,529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Holdin' On - Skrillex & Nero Remix,spotify:track:6M9L1sPBHbVMpEYV8hYiGU,"[I See MONSTAS, Nero, Skrillex]",54,0.474,0.951,5,-3.000,1,0.0701,0.000303,0.570000,0.6720,0.0775,174.119,237381,4,872
9996,Alberto Balsalm,spotify:track:21Phj46KeUHOWyZW9A9b7P,[Aphex Twin],57,0.792,0.599,3,-13.647,1,0.1410,0.316000,0.931000,0.1110,0.5700,93.994,310747,4,175
9997,Lgbt,spotify:track:0lsw4q8Jei7gEoV7kFe3DS,[cupcakKe],47,0.795,0.857,0,-8.676,0,0.1620,0.172000,0.000018,0.1010,0.4190,124.967,161332,4,684
9998,フライト 日 '89 (FRIDAY),spotify:track:2Oql5y6yNB0XUb8OWbi7pq,[EVADE FROM 宇宙],57,0.501,0.842,5,-6.774,0,0.2800,0.052000,0.006270,0.2460,0.6850,127.840,135058,4,940
