In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import library as lib
from sklearn.cluster import KMeans

In [None]:
# Pin the global random seeds of NumPy and TensorFlow so that repeated
# runs of the notebook produce the same results
from numpy.random import seed
from tensorflow import random

seed(1)
random.set_seed(1)

In [None]:
# Load the dataset through the project helper and preview the first five rows
df = lib.import_music_df()
df.head(5)

In [None]:
# Show the column labels of the loaded dataset
df.columns

In [None]:
# Keep a single row per track: after sorting by 'Placement' (ascending),
# only the last-sorted occurrence of each 'Track' is retained.
df = df.sort_values('Placement').drop_duplicates('Track', keep='last')

# Columns used as clustering inputs.
feature_columns = ['danceability', 'energy', 'key', 'loudness', "speechiness", 'acousticness', 'liveness', 'valence', 'tempo']

# `.copy()` makes this an independent frame, so adding columns to it later
# neither touches `df` nor triggers pandas' SettingWithCopyWarning.
attribute_df = df[feature_columns].copy()

# Preview only; the full frame is too large to be a useful cell output.
attribute_df.head()

## Machine Learning

In [None]:
# Elbow method: fit K-Means for k = 1..9 and record the inertia of each fit.
# `lists` and `K` are consumed by the plotting cell that follows.
lists = []
K = range(1, 10)

# Fit on the feature columns only. If a 'Cluster' label column has already
# been attached to `attribute_df` (e.g. when this cell is re-run later in the
# session) it must not be treated as a feature.
elbow_features = attribute_df.drop(columns="Cluster", errors="ignore")

for k in K:
    # random_state makes every fit repeatable regardless of how often or in
    # which order cells are run; n_init is set explicitly because its default
    # differs between scikit-learn releases.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=1)
    kmeanModel.fit(elbow_features)
    lists.append(kmeanModel.inertia_)

In [None]:
# Elbow plot: the inertia recorded for each candidate number of clusters
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, lists, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

## Without Scaling

In [None]:
# K-Means with 3 clusters on the raw (unscaled) features.
# random_state makes the fit repeatable no matter how often the cell is run;
# n_init is set explicitly because its default differs between scikit-learn
# releases. `KMeans` is already imported in the notebook's first cell.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=1)

# Fit on the feature columns only. Dropping a pre-existing 'Cluster' column
# keeps the cell idempotent: on a re-run the old labels are not fed back in
# as a feature.
unscaled_features = attribute_df.drop(columns="Cluster", errors="ignore")

# Fit the model and obtain the cluster label of every row in one step;
# the labels are kept as `predicted_clusters_no_scale`.
predicted_clusters_no_scale = kmeans.fit_predict(unscaled_features)

# `assign` returns a new frame (replacing any earlier 'Cluster' column)
# instead of writing into a frame that may be a slice of `df`.
attribute_df = attribute_df.assign(Cluster=predicted_clusters_no_scale)

### Cluster Quality

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Score the clustering on the feature columns only. If `attribute_df` carries
# the 'Cluster' label column (attached after fitting), it has to be removed
# first: otherwise the labels being evaluated would themselves take part in
# the distance computation and distort the score.
silhouette_features = attribute_df.drop(columns="Cluster", errors="ignore")
score = silhouette_score(silhouette_features, kmeans.labels_)

# Print the score
print('Silhouetter Score: %.3f' % score)

The silhouette score ranges from -1 to 1. The closer the value is to 1, the denser the clusters are
and the better separated they are from one another.

In [None]:
# Inertia of the fitted model: sum of squared distances of the samples
# to their closest cluster centre
inertia = kmeans.inertia_
print('Inertia: %.3f' % (inertia,))

The smaller the inertia, the denser the clusters.

In [None]:
# Plot the tempo of every track against its assigned cluster,
# coloured by cluster
plt.scatter(attribute_df['Cluster'], attribute_df['tempo'], c=attribute_df['Cluster'], s=40, cmap='viridis')

# Cluster ids are integers, so show one tick per cluster rather than
# matplotlib's default fractional ticks.
plt.xticks(sorted(attribute_df['Cluster'].unique()))

# Label the figure so it can be read on its own, like the elbow plot.
plt.xlabel('Cluster')
plt.ylabel('tempo')
plt.title('Tempo by cluster (without scaling)')
plt.show()

### Save to File

In [None]:
# Pass the K-Means model fitted on the unscaled features to the project
# helper together with the tag "without-scaling".
# NOTE(review): where and in which format the model is stored is defined in
# `lib.save_model`, which is not visible here.
lib.save_model(kmeans, "without-scaling")

## With Scaling

In [None]:
# NOTE(review): `preprocessing` is not referenced in the visible cells; the
# import is kept in case other code relies on it.
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Remove the label column left over from the unscaled run so that only the
# features are scaled. errors="ignore" keeps the cell re-runnable: a second
# execution no longer fails with KeyError once the column is gone.
attribute_df = attribute_df.drop(columns="Cluster", errors="ignore")

# Fit and apply the scaler once. `X_scaled` (NumPy array) is the model input;
# `X_scaled_df` is an independent, labelled copy used for display and plots.
X_scaled = scaler.fit_transform(attribute_df)
X_scaled_df = pd.DataFrame(X_scaled, columns=attribute_df.columns, index=attribute_df.index, copy=True)
X_scaled_df.head()

In [None]:
# K-Means with 3 clusters on the min-max scaled features.
# random_state makes the fit repeatable no matter how often the cell is run;
# n_init is set explicitly because its default differs between scikit-learn
# releases. `KMeans` is already imported in the notebook's first cell.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=1)

# Fit the model and obtain the cluster label of every row in one step;
# the labels are kept as `predicted_clusters_with_scale`.
predicted_clusters_with_scale = kmeans.fit_predict(X_scaled)

# Attach the labels to the scaled frame; `assign` returns a new frame and
# replaces any 'Cluster' column from an earlier run of this cell.
X_scaled_df = X_scaled_df.assign(Cluster=predicted_clusters_with_scale)

### Cluster Quality

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette score of the scaled samples under the labels of the fitted model
score = silhouette_score(X_scaled, labels=kmeans.labels_)

# Report the score with three decimals
print('Silhouetter Score: %.3f' % (score,))


The silhouette score ranges from -1 to 1. The closer the value is to 1, the denser the clusters are
and the better separated they are from one another.

In [None]:
# Inertia of the fitted model: sum of squared distances of the samples
# to their closest cluster centre
inertia = kmeans.inertia_
print('Inertia: %.3f' % (inertia,))

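The smaller the inertia, the denser the clusters. Inertia depends on the scale of the features, so this value cannot be compared directly with the inertia of the model trained without scaling.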

In [None]:
# Plot the (min-max scaled) tempo of every track against its assigned
# cluster, coloured by cluster
plt.scatter(X_scaled_df['Cluster'], X_scaled_df['tempo'], c=X_scaled_df['Cluster'], s=40, cmap='viridis')

# Cluster ids are integers, so show one tick per cluster rather than
# matplotlib's default fractional ticks.
plt.xticks(sorted(X_scaled_df['Cluster'].unique()))

# Label the figure so it can be read on its own, like the elbow plot.
plt.xlabel('Cluster')
plt.ylabel('tempo (min-max scaled)')
plt.title('Tempo by cluster (with scaling)')
plt.show()

### Save to File

In [None]:
# Pass the K-Means model fitted on the min-max scaled features to the project
# helper together with the tag "with-scaling".
# NOTE(review): only `kmeans` is handed over; the fitted `scaler` is not.
# New data must be scaled the same way before calling predict on the saved
# model - confirm whether `lib.save_model` or its consumers account for this.
lib.save_model(kmeans, "with-scaling")