# K-Means algorithm

In [None]:
# Requires scikit-learn >= 1.2 (for `set_output`). If needed, run: %pip install --upgrade scikit-learn

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

# Exploring 1000 song dataset

In [None]:
# Load the audio-features table from a CSV path relative to the working directory.
df_1000 = pd.read_csv(filepath_or_buffer='df_audio_features_1000.csv')

In [None]:
# Drop the columns that are not used in this section. Re-assigning (instead of
# `inplace=True`) together with `errors='ignore'` keeps this cell safe to re-run:
# the in-place version raises KeyError once the columns are already gone.
df_1000 = df_1000.drop(columns=['type', 'id', 'html'], errors='ignore')

In [None]:
# Use the 'name' column as the row index. The guard makes the cell idempotent:
# after the first run 'name' is no longer a column, and calling set_index
# again would raise KeyError.
if 'name' in df_1000.columns:
    df_1000 = df_1000.set_index('name')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_1000.head()

In [None]:
# Keep only the two features used for clustering in this section.
df1000songs = df_1000.loc[:, ['energy', 'tempo']]
df1000songs.head()

## 2. K-Means on unscaled DataFrame:
I suggest `n_clusters=5` for the 1000-song dataset, since our "bosses" asked for roughly 200-250 songs per cluster (1000 / 5 = 200).  
With the larger dataset we can aim for about 250 songs per cluster. See LMS 6.5 and 6.6 for a deeper understanding.  

In [None]:
# To get used to the "raw numbers":
# summary statistics of energy and tempo before any scaling is applied.
df1000songs.describe()

In [None]:
# Scatter tempo against energy to eyeball any relationship between the two.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='tempo', y='energy', data=df1000songs, ax=ax)
ax.set_title('Looking for a relation between tempo and energy')
plt.show()

In [None]:
# Fit K-Means on the raw (unscaled) energy/tempo values.
kmean_test = df1000songs[['energy', 'tempo']]
# random_state pins the centroid initialisation so that Restart & Run All gives
# the same clusters; n_init is set explicitly because its default differs
# between scikit-learn versions.
test_model = KMeans(n_clusters=5, n_init=10, random_state=42)
test_model.fit(kmean_test)

# Put the centroids in a DataFrame, labelled with the feature names the model
# was fit on (the centroid columns follow the order of the input columns).
model_centroids = pd.DataFrame(test_model.cluster_centers_, columns=kmean_test.columns)
model_centroids

In [None]:
# Overlay the fitted centroids (large red markers) on the unscaled songs.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set(title='Comparing our centroids and our dataset', xlabel='tempo', ylabel='energy')

sns.scatterplot(x='tempo', y='energy', data=kmean_test, ax=ax)
sns.scatterplot(x='tempo', y='energy', data=model_centroids, color='red', s=250, ax=ax)
plt.show()

## 3. K-Means on scaled DataFrame:

In [None]:
# Reference statistics of the unscaled features, to compare against the scaled version below.
df1000songs.describe()

In [None]:
# Rescale both features with MinMaxScaler so that they are on a comparable scale;
# set_output(transform='pandas') makes the scaler return a DataFrame.
# (MinMaxScaler is imported in the imports cell at the top of the notebook.)
my_scaler = MinMaxScaler().set_output(transform='pandas')
df1000songs_scaled = my_scaler.fit_transform(df1000songs)
# let's see the first 5 observations after normalising the data
df1000songs_scaled.head()

In [None]:
# Sanity check of the scaling: with MinMaxScaler's default range, each feature's min and max should be 0 and 1.
df1000songs_scaled.describe()

# K-means

In [None]:
# Based on the quick look at the scatter plot,
# we are going to assume there can be 5 different clusters of songs.
# random_state makes the clustering reproducible; n_init is explicit because
# its default differs between scikit-learn versions.
my_model = KMeans(n_clusters=5, n_init=10, random_state=42)
# Fit on the two feature columns only, so that re-running this cell after a
# 'cluster' column has been added to the frame does not feed the labels back in.
my_model.fit(df1000songs_scaled[['energy', 'tempo']])

In [None]:
# Centroid coordinates in the scaled feature space: one row per cluster.
my_model.cluster_centers_

In [None]:
# Wrap the centroid array in a labelled DataFrame so it can be plotted with seaborn.
df1000songs_mod = pd.DataFrame(
    data=my_model.cluster_centers_,
    columns=['energy', 'tempo'],
)
df1000songs_mod

# Explore our KMeans results

In [None]:
# Overlay the centroids (large black markers) on the scaled songs.
fig, ax = plt.subplots(figsize=(10, 8))
# Select the two feature columns explicitly instead of re-wrapping the frame and
# overwriting its column labels: the overwrite fails with a length-mismatch
# error when this cell is re-run after a 'cluster' column has been added.
plot_data = df1000songs_scaled[['energy', 'tempo']]

ax.set_title('Comparing our centroids and our dataset')
ax.set_xlabel('tempo normalised')
ax.set_ylabel('energy normalised')
sns.scatterplot(data=plot_data, x='tempo', y='energy', ax=ax)
sns.scatterplot(data=df1000songs_mod, x='tempo', y='energy', color='black', s=250, ax=ax)
plt.show()

In [None]:
# Cluster label assigned to each row of the data the model was fit on.
my_model.labels_

# Adding the cluster column

In [None]:
# Attach each row's cluster label as a new 'cluster' column.
# NOTE(review): this adds the column to df1000songs_scaled itself, so any cell
# above that consumes the whole frame will also see 'cluster' if it is re-run
# after this one.
df1000songs_scaled['cluster'] = my_model.labels_
df1000songs_scaled

In [None]:
# Colour every song by its assigned cluster and overlay the centroids in black.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set(
    title='Visualising clusters',
    xlabel='tempo normalised',
    ylabel='energy normalised',
)

# songs first, centroids drawn on top
sns.scatterplot(
    x='tempo', y='energy', hue='cluster',
    data=df1000songs_scaled, palette='Set2', s=75, ax=ax,
)
sns.scatterplot(
    x='tempo', y='energy',
    data=df1000songs_mod, color='black', s=100, ax=ax,
)
plt.show()