In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- importing registers the "3d" projection on older Matplotlib
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MinMaxScaler

## Extract

In [None]:
# Load the Billboard hits dataset; the path is relative to the notebook's working directory.
BILLBOARD_CSV = '../data/external/billboard_hits.csv'
df = pd.read_csv(BILLBOARD_CSV)

## Transform

In [None]:
# Show the column labels of the loaded frame; the bare expression is rendered
# as the cell's output.
df.columns

In [None]:
# Audio-feature columns used as the clustering inputs.
FEATURE_COLUMNS = ['danceability', 'energy', 'key', 'loudness', 'speechiness',
                   'acousticness', 'liveness', 'valence', 'tempo']

# Take an explicit copy: later cells add a "Cluster" column to this frame, and
# writing to a column selection of `df` can raise SettingWithCopyWarning.
attribute_df = df[FEATURE_COLUMNS].copy()

# Preview the first rows only instead of rendering the whole frame.
attribute_df.head()

## Machine Learning

In [None]:
# Dead experiment: every line in this cell is commented out, so nothing runs.
# NOTE(review): `sixty_df_clean` is not defined anywhere in the visible part of
# the notebook -- delete this cell or restore the code that builds that frame.
# X = sixty_df_clean[['energy']]
# y = sixty_df_clean[['loudness']]

In [None]:
# Elbow-method sweep: fit K-Means for k = 1..9 and record each model's inertia.

# Never cluster on a previously assigned "Cluster" label; the column is present
# when this cell is re-run after the clustering cells further down.
elbow_features = attribute_df.drop(columns=['Cluster'], errors='ignore')

K = range(1, 10)
lists = []  # inertia per k, in the same order as K (name kept for the plotting cell)
for k in K:
    # A fixed seed makes the curve reproducible across kernel restarts; the
    # explicit n_init keeps the number of restarts independent of the installed
    # scikit-learn version's default.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(elbow_features)
    lists.append(kmeanModel.inertia_)

In [None]:
# Elbow plot: recorded inertia (y) against the number of clusters k (x).
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, lists, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

## Without Scaling

In [None]:
# Cluster the raw (unscaled) audio features into three groups.

# Exclude any "Cluster" column left over from an earlier run of this cell so the
# labels are never fed back in as a feature.
raw_features = attribute_df.drop(columns=['Cluster'], errors='ignore')

# Fixed seed and explicit n_init: the cluster numbering and the scores reported
# below stay the same on every Restart & Run All.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit and label in one pass (same result as fit() followed by predict() on the
# training data), then store the label of every row next to its features.
predicted_clusters = kmeans.fit_predict(raw_features)
attribute_df["Cluster"] = predicted_clusters

## Cluster Quality

In [None]:
# Silhouette score of the unscaled clustering.

# By now `attribute_df` also carries the assigned "Cluster" column. Drop it so
# the score is computed on the audio features only; otherwise the labels being
# evaluated would themselves count as a feature.
silhouette_features = attribute_df.drop(columns=['Cluster'], errors='ignore')
score = silhouette_score(silhouette_features, kmeans.labels_)

# Print the score
print('Silhouetter Score: %.3f' % score)

The silhouette score ranges from -1 to 1. The closer the value is to 1, the denser the clusters are
and the better they are separated from one another.

In [None]:
# Report the inertia of the model currently bound to `kmeans`
# (sum of squared distances of the samples to their closest cluster centre).
inertia = kmeans.inertia_
print('Inertia: %.3f' % (inertia,))

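The smaller the inertia, the denser the clusters. Inertia depends on the scale of the features and tends to shrink as k grows, so only compare it between models fitted on the same features with the same k.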

In [None]:
# Plot tempo against the assigned cluster for the unscaled clustering.
# The figure is labelled so it can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(attribute_df['Cluster'], attribute_df['tempo'],
           c=attribute_df['Cluster'], s=40, cmap='viridis')
ax.set_xlabel('Cluster')
ax.set_ylabel('tempo')
ax.set_title('Tempo by cluster (unscaled features)')
plt.show()

## With Scaling

In [None]:
# Min-max scale the audio features for the second clustering run.

# `attribute_df` holds the "Cluster" labels of the unscaled run by this point.
# Exclude them so the earlier labels do not leak into the scaled feature matrix.
scaling_features = attribute_df.drop(columns=['Cluster'], errors='ignore')

scaler = MinMaxScaler()

# Fit and transform once, then reuse the result for both the plain array
# (used for fitting and scoring) and the labelled frame (used for plotting).
X_scaled = scaler.fit_transform(scaling_features)
X_scaled_df = pd.DataFrame(X_scaled, columns=scaling_features.columns, index=scaling_features.index)


In [None]:
# Cluster the min-max scaled features into three groups.
# This rebinds `kmeans` and `predicted_clusters`, replacing the values left by
# the unscaled run.

# Fixed seed and explicit n_init: labels and scores are reproducible on every
# Restart & Run All and do not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit and label in one pass (same result as fit() followed by predict() on the
# training data), then store the label of every row in the scaled frame.
predicted_clusters = kmeans.fit_predict(X_scaled)
X_scaled_df["Cluster"] = predicted_clusters

## Cluster Quality

In [None]:
# Silhouette score of the scaled clustering, computed on the same array the
# model was fitted on. `silhouette_score` comes from the notebook's import cell.
score = silhouette_score(X_scaled, kmeans.labels_)

# Print the score
print('Silhouetter Score: %.3f' % score)


The silhouette score ranges from -1 to 1. The closer the value is to 1, the denser the clusters are
and the better they are separated from one another.

In [None]:
# Report the inertia of the model currently bound to `kmeans`
# (sum of squared distances of the samples to their closest cluster centre).
inertia = kmeans.inertia_
print('Inertia: %.3f' % (inertia,))

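The smaller the inertia, the denser the clusters. Because inertia is measured in the units of the features, this value (min-max scaled features) cannot be compared directly with the inertia of the unscaled model.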

In [None]:
# Plot tempo against the assigned cluster for the scaled clustering.
# `tempo` is the min-max scaled value here, which the axis label states.
fig, ax = plt.subplots()
ax.scatter(X_scaled_df['Cluster'], X_scaled_df['tempo'],
           c=X_scaled_df['Cluster'], s=40, cmap='viridis')
ax.set_xlabel('Cluster')
ax.set_ylabel('tempo (min-max scaled)')
ax.set_title('Tempo by cluster (scaled features)')
plt.show()

## Full Dataframe

In [None]:
# Copy chart placement onto the scaled frame and the cluster labels onto the
# full frame. `predicted_clusters` is whatever the most recent clustering cell
# assigned; in notebook order that is the scaled run.
X_scaled_df["Placement"] = df["Placement"]
df["Cluster"] = predicted_clusters

## Linear Regression

In [None]:
# Keep the rows of the full frame whose label in `attribute_df` equals 1, then
# plot tempo against year for them.
# NOTE(review): the mask reads attribute_df['Cluster'] (labels of the unscaled
# run) while df['Cluster'] holds the labels of the scaled run -- confirm which
# labelling is intended here.
in_cluster_1 = attribute_df['Cluster'] == 1
cluster_1_df = df[in_cluster_1]

X = cluster_1_df[['Year']]
y = cluster_1_df[['tempo']]

plt.scatter(X, y)

## Multi Linear Regression

In [None]:
# 3-D scatter of placement, tempo and cluster label for the selected rows.
# X, y and z stay single-column DataFrames, as before.
X = cluster_1_df[['Placement']]
y = cluster_1_df[['tempo']]
z = cluster_1_df[['Cluster']]

fig = plt.figure(1, figsize=(5, 5))

# Create the 3-D axes through the figure. Constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on current Matplotlib (the plot
# comes out empty), and plt.cm.get_cmap has been removed, so the colormap is
# passed by name instead.
axes = fig.add_subplot(projection='3d')
axes.view_init(elev=20, azim=45)

# Flatten the single-column frames to 1-D arrays for plotting.
cluster_values = z.to_numpy().ravel()
axes.scatter(X.to_numpy().ravel(), y.to_numpy().ravel(), cluster_values,
             c=cluster_values, cmap='Spectral')
axes.set_xlabel('Placement')
axes.set_ylabel('tempo')
axes.set_zlabel('Cluster')
plt.show()

## Top Songs

In [None]:
# Split the songs into bands of 20 chart placements each.
# NOTE(review): `top_20` is taken from the scaled frame (scaled feature values)
# while the other bands are taken from `df` (raw values) -- confirm this
# difference is intended.
scaled_placement = X_scaled_df['Placement']
placement = df['Placement']

top_20 = X_scaled_df[scaled_placement.between(1, 20)]
top_40 = df[placement.between(21, 40)]
top_60 = df[placement.between(41, 60)]
top_80 = df[placement.between(61, 80)]
top_100 = df[placement.between(81, 100)]

In [None]:
# Plot tempo against cluster label for each placement band, one figure per band.
# The four copy-pasted cells are replaced by one loop, and every figure gets a
# title and axis labels so the bands can be told apart.
# `top_100` was not plotted originally; append it to the list to include it.
placement_bands = [
    ('placements 1-20', top_20),
    ('placements 21-40', top_40),
    ('placements 41-60', top_60),
    ('placements 61-80', top_80),
]

for band_label, band_df in placement_bands:
    fig, ax = plt.subplots()
    ax.scatter(band_df['Cluster'], band_df['tempo'],
               c=band_df['Cluster'], s=40, cmap='viridis')
    ax.set_xlabel('Cluster')
    ax.set_ylabel('tempo')
    ax.set_title('Tempo by cluster, %s' % band_label)
    plt.show()

In [None]:
# Decades: label every row with the decade its year falls in, written as the
# decade's first year followed by "s".
decade_start = df['Year'] // 10 * 10
df['Decade'] = decade_start.astype(str) + 's'

In [None]:
# One frame per decade, selected by the "Decade" label.
# No filtering by placement happens here: every row of the decade is kept.
decade_label = df['Decade']

sixty_df = df[decade_label == '1960s']
seventy_df = df[decade_label == '1970s']
eighty_df = df[decade_label == '1980s']
ninety_df = df[decade_label == '1990s']
twoth_df = df[decade_label == '2000s']
twotn_df = df[decade_label == '2010s']

In [None]:
# Plot loudness, then speechiness, against cluster label for every decade.
# The twelve copy-pasted cells are replaced by one nested loop that draws the
# figures in the same order, each with a title and axis labels.
decade_frames = [
    ('1960s', sixty_df),
    ('1970s', seventy_df),
    ('1980s', eighty_df),
    ('1990s', ninety_df),
    ('2000s', twoth_df),
    ('2010s', twotn_df),
]

for feature in ('loudness', 'speechiness'):
    for decade_name, decade_df in decade_frames:
        fig, ax = plt.subplots()
        ax.scatter(decade_df['Cluster'], decade_df[feature],
                   c=decade_df['Cluster'], s=40, cmap='viridis')
        ax.set_xlabel('Cluster')
        ax.set_ylabel(feature)
        ax.set_title('%s by cluster, %s' % (feature, decade_name))
        plt.show()