In [None]:
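# %pip install --upgrade scikit-learn  # uncomment and run once if scikit-learn is older than 1.2 (needed for set_config(transform_output=...))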

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames; later cells add a
# column to the scaler output and group it by name.
set_config(transform_output="pandas")

# Exploring 1000 song dataset

In [None]:
# Load the audio-feature table for the 1000-song dataset from the working directory.
df_1000 = pd.read_csv(filepath_or_buffer="df_audio_features_1000.csv")

In [None]:
# Build a readable "name - artist" label for every song.
df_1000 = df_1000.assign(title=lambda songs: songs['name'] + ' - ' + songs['artist'])

In [None]:
# Use the "name - artist" label as the row index.
df_1000 = df_1000.set_index(keys='title')

In [None]:
# Remove identifier/metadata columns so only the remaining feature columns are kept.
non_feature_columns = ['type', 'id', 'html', 'name', 'artist']
df_1000 = df_1000.drop(columns=non_feature_columns)

In [None]:
# Inspect the cleaned table: one row per "name - artist" title, identifier columns removed.
df_1000

In [None]:
# Work on an independent copy so df_1000 is not affected by later steps.
df1000songs=df_1000.copy()
df1000songs

# Scale data
I would suggest `n_clusters=5` for the 1000-song dataset, since our "bosses" ask for roughly 200-250 songs per cluster.  
With the larger dataset we can work with 250 songs. Read LMS 6.5 and 6.6 for deeper understanding.  

In [None]:
# Summary statistics of the unscaled features, to get a feel for each
# column's raw range before scaling.
df1000songs.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the data, then rescale the
# same data with the fitted scaler.
my_scaler = MinMaxScaler().fit(df1000songs)
scaled_df = my_scaler.transform(df1000songs)

In [None]:
# Inspect the scaler output (MinMaxScaler with default settings, so each column spans [0, 1]).
scaled_df

# Cluster creation

In [None]:
# Reference model with k=5 (KMeans is already imported in the imports cell).
# n_init is set explicitly so this model is built the same way as the models in
# the inertia and silhouette loops; the library default for n_init differs
# between scikit-learn releases.
my_model = KMeans(n_clusters=5, n_init=10, random_state=123)

# Fit on the feature columns only. If this cell is re-run after the 'cluster'
# label column has been added to scaled_df, the labels must not be treated as
# a feature; on a first run there is no such column and nothing is dropped.
my_model.fit(scaled_df.drop(columns='cluster', errors='ignore'))

# Adding column clusters

In [None]:
# Feature-only copy used for scoring later on. Dropping 'cluster' when it is
# present keeps this frame label-free even if the cell is re-run after the
# labels were added to scaled_df; on a first run nothing is dropped.
scaled_df_without_clusters = scaled_df.drop(columns='cluster', errors='ignore').copy()

In [None]:
# Attach the label that the fitted model assigned to each song.
scaled_df = scaled_df.assign(cluster=my_model.labels_)

In [None]:
# Peek at five random songs with their cluster label; seeded so the same rows
# are shown on every run.
scaled_df.sample(5, random_state=123)

# Exploring the mean of data

In [None]:
# Mean of every scaled feature within each cluster (one row per cluster label).
clustered_centroids = scaled_df.groupby(by='cluster').agg('mean')
clustered_centroids

# Comparing clusters

In [None]:
from sklearn.metrics import pairwise_distances

# Distance between every pair of fitted centroids (pairwise_distances uses the
# Euclidean metric by default). Small values mark clusters that sit close together.
distances_centroids = pairwise_distances(my_model.cluster_centers_)

# Annotated, labelled heatmap so the figure can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(distances_centroids, annot=True, fmt=".2f", ax=ax)
ax.set(title="Distance between cluster centroids", xlabel="Cluster", ylabel="Cluster");

# Making radar plot

In [None]:
from math import pi

# One row per cluster: the cluster label followed by the mean of each scaled feature.
radar_df = clustered_centroids.reset_index().rename(columns={'index':'cluster'})

# ------- PART 1: Create background

# One axis per feature (every column except the leading cluster label).
categories = radar_df.columns.tolist()[1:]
N = len(categories)

# Spread the axes evenly around the circle; the first angle is repeated at the
# end so that every polygon closes on itself.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Initialise the spider plot
fig, ax = plt.subplots(subplot_kw={'polar': True})

# Put the first axis at the top and proceed clockwise.
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per variable and label it
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)

# ------- PART 2: Add plots

# One outline plus a translucent fill per cluster. Looping over the rows keeps
# the chart in sync with however many clusters the model has, and the fill
# reuses the outline's colour so each shaded area matches its legend entry.
for _, centroid in radar_df.iterrows():
    values = centroid.drop('cluster').tolist()
    values += values[:1]
    # iterrows() upcasts the integer label to float, hence the int() for the legend text.
    (outline,) = ax.plot(angles, values, linewidth=1,
                         label=f"Cluster {int(centroid['cluster'])}")
    ax.fill(angles, values, color=outline.get_color(), alpha=0.1)

ax.set_title("Mean scaled feature values per cluster", pad=20)

# Add legend
ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Show the graph
plt.show()

# Calculating inertia

A for loop that fits the clustering for each value of k and appends the inertia to a list should do the trick. We use the range() function to generate the values of k (`range(1, max_clusters)` with `max_clusters = 30` yields 1 to 29) and pass each value as n_clusters in that iteration:

In [None]:
# Upper bound is exclusive: the loop evaluates k = 1 .. max_clusters - 1.
max_clusters = 30

# Fit one model per k on the label-free scaled features and record its inertia,
# rounded to three decimals.
inertia_list = []
for k in range(1, max_clusters):
    model_loop = KMeans(n_clusters=k, n_init=10, random_state=123).fit(scaled_df_without_clusters)
    inertia_list.append(round(model_loop.inertia_, 3))

In [None]:
# Inertia for each k, in the order the loop evaluated them (k = 1, 2, ...).
print(inertia_list)

In [None]:
sns.set_theme(style='darkgrid')

# The inertia list starts at k=1 and holds one entry per k, so the x values are
# derived from its length. This keeps x and y aligned even after max_clusters
# is reassigned by a later cell, and lets the title state the range actually plotted.
inertia_ks = list(range(1, len(inertia_list) + 1))
(sns.relplot(kind='line', x=inertia_ks, y=inertia_list, marker='o', height=6, aspect=2)
    .set(title=f"Inertia score from 1 to {len(inertia_list)} clusters")
    .set_axis_labels("Number of clusters", "Inertia score")
);

In this line plot, with the number of clusters (k) on the x axis and the inertia on the y axis, we are looking for an elbow: the point where the decline stops being sharp and becomes smooth and gradual.

One might argue that the elbow happens at k=4 or at k=5. It is a pity, since we are looking for a good value between 6 and 30. 

# Silhouette score

To compute the silhouette score, you can use Scikit-Learn’s silhouette_score() function, giving it all the instances in the dataset, and the labels they were assigned:

In [None]:
# Silhouette score of the labels from my_model, evaluated on the scaled
# features kept without the label column.
silhouette_score(scaled_df_without_clusters,my_model.labels_)

In [None]:
# Upper bound is exclusive: the loop evaluates k = 2 .. max_clusters - 1.
# The loop starts at 2 rather than 1.
max_clusters = 29

# Fit one model per k on the label-free scaled features and record its
# silhouette score, rounded to two decimals.
silhouette_list = []
for k in range(2, max_clusters):
    model_loop = KMeans(n_clusters=k, n_init=10, random_state=123).fit(scaled_df_without_clusters)
    score = silhouette_score(scaled_df_without_clusters, model_loop.labels_)
    silhouette_list.append(round(score, 2))

If you were able to plot the inertia values for every value of k computed above, you should be able to do the same for the silhouette coefficients. Here, you will not be looking for elbows, but for local maxima.

In [None]:
sns.set_theme(style='darkgrid')

# The silhouette list starts at k=2 and holds one entry per k, so the x values
# are derived from its length. This keeps x and y aligned regardless of the
# current value of max_clusters and lets the title state the range actually plotted.
silhouette_ks = list(range(2, len(silhouette_list) + 2))
(sns.relplot(kind='line', x=silhouette_ks, y=silhouette_list, marker='o', height=6, aspect=2)
    .set(title=f"Silhouette score from 2 to {len(silhouette_list) + 1} clusters")
    .set_axis_labels("Number of clusters", "Silhouette score")
);