# Unsupervised Clustering of Audio Features to Generate Playlists

In [116]:
import numpy as np
import pandas as pd

from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## Data

In [2]:
tracks_features = pd.read_csv('tracks_features.csv', index_col='id')

In [4]:
tracks_features.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2QaG8LwDIs9WYI8e0B6VPi,0.537,0.535,3,-5.842,1,0.0284,0.0399,3.3e-05,0.067,0.324,117.895,313187,3
1HpjlU33hPxFR0Px0Vp2ka,0.322,0.336,3,-8.049,1,0.0319,0.769,0.484,0.105,0.0882,120.113,342347,3
4UADR6fNQfx4fxkiRQvSy2,0.483,0.695,0,-5.493,1,0.0313,0.00461,0.000661,0.102,0.294,126.212,295333,4
1a1SQeSqUKzH5OUVTEx4ae,0.396,0.177,5,-10.277,1,0.0378,0.718,0.0,0.139,0.221,177.215,201694,3
6tBv2nLRHBEmjVDOlxctbg,0.273,0.308,3,-14.243,1,0.0311,0.948,0.934,0.0933,0.0387,79.847,213629,4


In [37]:
from sklearn.preprocessing import scale

In [40]:
tracks_features_scaled = pd.DataFrame(scale(tracks_features), columns=tracks_features.columns, index=tracks_features.index)

In [41]:
tracks_features_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2QaG8LwDIs9WYI8e0B6VPi,0.569058,1.464513,-0.523603,1.558327,0.537412,-0.463768,-2.583128,-0.619616,-0.738089,0.709972,0.134508,1.124353,-1.481179
1HpjlU33hPxFR0Px0Vp2ka,-0.951977,0.369518,-0.523603,1.114896,0.537412,-0.326082,0.067646,0.832469,-0.330023,-0.881728,0.210506,1.560711,-1.481179
4UADR6fNQfx4fxkiRQvSy2,0.18703,2.34491,-1.362345,1.628448,0.537412,-0.349685,-2.711431,-0.617733,-0.362238,0.507465,0.419482,0.857181,0.327393
1a1SQeSqUKzH5OUVTEx4ae,-0.428458,-0.505376,0.035559,0.667245,0.537412,-0.093982,-0.117773,-0.619716,0.035089,0.0147,2.167048,-0.544056,-1.481179
6tBv2nLRHBEmjVDOlxctbg,-1.298631,0.215449,-0.523603,-0.129605,0.537412,-0.357553,0.718433,2.182642,-0.455664,-1.215863,-1.169168,-0.365458,0.327393


In [42]:
features = ['energy', 'acousticness', 'instrumentalness', 'tempo']

In [131]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, width=1000, height=800)

## DBSCAN

In [17]:
from sklearn.cluster import DBSCAN

In [94]:
DBSCAN_clustering = DBSCAN(eps=0.60, metric='euclidean').fit(tracks_features[features])

In [95]:
num_clusters = len(set(DBSCAN_clustering.labels_))
print(f'{num_clusters} clusters found')

22 clusters found


In [96]:
DBSCAN_clustering.labels_

array([-1,  0, -1, ...,  1,  1,  2], dtype=int64)

In [97]:
px.histogram(x=DBSCAN_clustering.labels_, title='Distribution of DBSCAN Cluster Points')

In [113]:
# TODO: Compute clustering metrics

In [109]:
# Add clustering to dataframe
tracks_features['DBSCAN_cluster'] = DBSCAN_clustering.labels_

In [110]:
# filter out clusters that are too small
filt = tracks_features.DBSCAN_cluster.value_counts(normalize=True) < 0.05
filt = list(filt[tracks_features.DBSCAN_cluster])
tracks_features.DBSCAN_cluster[filt] = -1

In [117]:
DBSCAN_sil = metrics.silhouette_score(tracks_features[features], DBSCAN_clustering.labels_, metric='euclidean')
print('silhouette score: ', DBSCAN_sil)

silhouette score:  -0.22031588478629896


In [118]:
DBSCAN_calinksi = metrics.calinski_harabasz_score(tracks_features[features], DBSCAN_clustering.labels_)
print('Calinki-Harabasz Index: ', DBSCAN_calinksi)

Calinki-Harabasz Index:  99.37846369747496


In [119]:
DBSCAN_DB = metrics.davies_bouldin_score(tracks_features[features], DBSCAN_clustering.labels_)
print('Davies-Bouldin Score: ', DBSCAN_DB)

Davies-Bouldin Score:  3.855238550445556


In [133]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
color=tracks_features.DBSCAN_cluster.astype(str),
 width=1000, height=800)

## Affinity Propagation 

In [3]:
from sklearn.cluster import AffinityPropagation

In [112]:
AP_clustering = AffinityPropagation(affinity='euclidean').fit(tracks_features[features])

In [115]:
set(AP_clustering.labels_)

{-1}

Affinity propagation clustering didn't converge...

# KMeans

In [120]:
from sklearn.cluster import KMeans

In [121]:
kmeans_clustering = KMeans(n_clusters=4).fit(tracks_features[features])

In [122]:
set(kmeans_clustering.labels_)

{0, 1, 2, 3}

In [123]:
px.histogram(x=kmeans_clustering.labels_, title='Distribution of KMeans Cluster Points')

In [125]:
#Add clustering to dataframe
tracks_features['KMeans_cluster'] = kmeans_clustering.labels_

In [126]:
KMeans_sil = metrics.silhouette_score(tracks_features[features], kmeans_clustering.labels_, metric='euclidean')
print('silhouette score: ', KMeans_sil)

KMeans_calinksi = metrics.calinski_harabasz_score(tracks_features[features], kmeans_clustering.labels_)
print('Calinki-Harabasz Index: ', KMeans_calinksi)

KMeans_DB = metrics.davies_bouldin_score(tracks_features[features], kmeans_clustering.labels_)
print('Davies-Bouldin Score: ', KMeans_DB)

silhouette score:  0.5636480009171122
Calinki-Harabasz Index:  4512.438907017599
Davies-Bouldin Score:  0.5079415437617825


In [135]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.KMeans_cluster.astype(str),
    width=1000, height=800,
    title='K-Means Clustering')

# Mean Shift

In [136]:
from sklearn.cluster import MeanShift

In [137]:
meanshift_clustering = MeanShift().fit(tracks_features[features])

In [138]:
set(meanshift_clustering.labels_)

{0, 1, 2, 3}

In [139]:
px.histogram(x=meanshift_clustering.labels_, title='Distribution of Mean Shift Cluster Points')

In [144]:
sil = metrics.silhouette_score(tracks_features[features], meanshift_clustering.labels_, metric='euclidean')
print('silhouette score: ', sil)

calinksi = metrics.calinski_harabasz_score(tracks_features[features], meanshift_clustering.labels_)
print('Calinki-Harabasz Index: ', calinksi)

DB = metrics.davies_bouldin_score(tracks_features[features], meanshift_clustering.labels_)
print('Davies-Bouldin Score: ', DB)

silhouette score:  0.5580573795504133
Calinki-Harabasz Index:  2026.6563099170378
Davies-Bouldin Score:  0.4874660633865898


In [141]:
#Add clustering to dataframe
tracks_features['meanshift_cluster'] = meanshift_clustering.labels_

In [143]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.meanshift_cluster.astype(str),
    width=1000, height=800,
    title='Mean Shift Clustering')

# Spectral Clustering 

In [146]:
from sklearn.cluster import SpectralClustering

In [150]:
clustering = SpectralClustering(n_clusters=4, assign_labels="discretize").fit(tracks_features[features])

In [151]:
set(clustering.labels_)

{0, 1, 2, 3}

In [152]:
px.histogram(x=clustering.labels_, title='Distribution of Spectral Clustering Points')

In [153]:
sil = metrics.silhouette_score(tracks_features[features], clustering.labels_, metric='euclidean')
print('silhouette score: ', sil)

calinksi = metrics.calinski_harabasz_score(tracks_features[features], clustering.labels_)
print('Calinki-Harabasz Index: ', calinksi)

DB = metrics.davies_bouldin_score(tracks_features[features], clustering.labels_)
print('Davies-Bouldin Score: ', DB)

silhouette score:  0.2726044561363285
Calinki-Harabasz Index:  298.4851415313299
Davies-Bouldin Score:  0.42946166268797953


not great...

In [154]:
#Add clustering to dataframe
tracks_features['spectral_cluster'] = clustering.labels_

In [155]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.spectral_cluster.astype(str),
    width=1000, height=800,
    title='Spectral Clustering')

# Agglomerative Clustering

In [156]:
from sklearn.cluster import AgglomerativeClustering

In [157]:
clustering = AgglomerativeClustering().fit(tracks_features[features])

In [158]:
set(clustering.labels_)

{0, 1}

In [159]:
px.histogram(x=clustering.labels_, title='Distribution of Agglomerative (Ward) Cluster Points')

In [160]:
sil = metrics.silhouette_score(tracks_features[features], clustering.labels_, metric='euclidean')
print('silhouette score: ', sil)

calinksi = metrics.calinski_harabasz_score(tracks_features[features], clustering.labels_)
print('Calinki-Harabasz Index: ', calinksi)

DB = metrics.davies_bouldin_score(tracks_features[features], clustering.labels_)
print('Davies-Bouldin Score: ', DB)

silhouette score:  0.5755327246437776
Calinki-Harabasz Index:  2903.1936708388357
Davies-Bouldin Score:  0.5696561576489922


In [161]:
#Add clustering to dataframe
tracks_features['agglomerative_cluster'] = clustering.labels_

In [162]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.agglomerative_cluster.astype(str),
    width=1000, height=800,
    title='Agglomerative (Ward) Clustering')

# OPTICS

In [163]:
from sklearn.cluster import OPTICS

In [260]:
clustering = OPTICS(metric='minkowski', eps=1, min_samples=40).fit(tracks_features[features])

In [261]:
print(set(clustering.labels_))

{0, 1, 2, 3, 4, 5, 6, 7, 8, -1}


In [262]:
px.histogram(x=clustering.labels_, title='Distribution of OPTICS Cluster Points')

In [266]:
sil = metrics.silhouette_score(tracks_features[features], clustering.labels_, metric='euclidean')
print('silhouette score: ', sil)

calinksi = metrics.calinski_harabasz_score(tracks_features[features], clustering.labels_)
print('Calinki-Harabasz Index: ', calinksi)

DB = metrics.davies_bouldin_score(tracks_features[features], clustering.labels_)
print('Davies-Bouldin Score: ', DB)

silhouette score:  -0.20028354906137302
Calinki-Harabasz Index:  83.03959181892047
Davies-Bouldin Score:  1.6148473581954907


In [263]:
#Add clustering to dataframe
tracks_features['OPTIC_cluster'] = clustering.labels_

In [265]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.OPTIC_cluster.astype(str),
    width=1000, height=800,
    title='OPTIC Clustering')

worse than DBSCAN; highly sensitive to hyperparameters

# Birch

In [267]:
from sklearn.cluster import Birch

In [270]:
clustering = Birch(n_clusters=4).fit(tracks_features[features])

In [271]:
set(clustering.labels_)

{0, 1, 2, 3}

In [272]:
px.histogram(x=clustering.labels_, title='Distribution of Birch Cluster Points')

In [273]:
#Add clustering to dataframe
tracks_features['birch_cluster'] = clustering.labels_

In [274]:
sil = metrics.silhouette_score(tracks_features[features], clustering.labels_, metric='euclidean')
print('silhouette score: ', sil)

calinksi = metrics.calinski_harabasz_score(tracks_features[features], clustering.labels_)
print('Calinki-Harabasz Index: ', calinksi)

DB = metrics.davies_bouldin_score(tracks_features[features], clustering.labels_)
print('Davies-Bouldin Score: ', DB)

silhouette score:  0.5616840041096997
Calinki-Harabasz Index:  4428.0969089122245
Davies-Bouldin Score:  0.5057082735225454


In [275]:
px.scatter_matrix(tracks_features_scaled, dimensions=features, 
    color=tracks_features.birch_cluster.astype(str),
    width=1000, height=800,
    title='Birch Clustering')

## Save Results

In [None]:
tracks_features.to_csv('tracks_features_clusters.csv')