In [142]:
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn_extra.cluster import KMedoids

import plotly.express as px

%matplotlib widget
from mpl_toolkits.mplot3d import Axes3D

## Preprocessing ##

In [66]:
# Load the song catalogue (one row per track) from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
songs_df = pd.read_csv(
    filepath_or_buffer='/Users/jpate/Documents/MySongsDataset.csv'
)
songs_df

Unnamed: 0,name,artist,genre,id,danceability,energy,loudness,speechiness,acousticness,tempo,liveness,valence
0,"So Long, Goodbye - Live",10 Years,rock,5mIxO1Ez7OWYEaUPAuG2ZU,0.390,0.295,-10.792,0.0316,0.61200,125.194,0.1080,0.120
1,Here Without You,3 Doors Down,rock,135cNW8kQtiTId7qcsJfVC,0.469,0.670,-4.604,0.0270,0.00815,144.045,0.1320,0.158
2,What's Up?,4 Non Blondes,pop,0jWgAnTrNZmOGmqgvHhZEm,0.565,0.564,-10.044,0.0292,0.16100,134.355,0.1140,0.454
3,Ghost Of You,5 Seconds of Summer,pop,1MhXdlCQPnO56T57MfmaRm,0.458,0.561,-4.819,0.0269,0.09650,149.875,0.1560,0.206
4,If It Means a Lot to You,A Day To Remember,post hardcore,6J7cSyvSCnPwv3vqHchEfL,0.584,0.536,-8.158,0.0279,0.11200,126.996,0.4560,0.440
...,...,...,...,...,...,...,...,...,...,...,...,...
811,Chicken Fried,Zac Brown Band,country,4dGJf1SER1T6ooX46vwzRB,0.566,0.713,-4.250,0.0417,0.64500,169.864,0.1140,0.807
812,You Come First (feat. Saweetie),Zak Abel,pop,5Ef2noaxqTAfa5gLVw05OJ,0.745,0.719,-4.291,0.1900,0.03520,99.962,0.0639,0.611
813,PILLOWTALK,ZAYN,pop,0IKK48xF4eEdfofyaeKWWO,0.588,0.702,-4.271,0.0496,0.10400,124.909,0.0890,0.429
814,Seven Nation Army,Zella Day,rock,7fBRH08A6Fu47GMyFamyoO,0.690,0.283,-7.309,0.0294,0.89100,104.348,0.1110,0.350


In [67]:
# Work on a copy so the frame loaded from disk stays untouched.
songs_normalized_df = songs_df.copy()

# Pull out the eight numeric feature columns that get standardised and
# clustered further down; the remaining columns are descriptive metadata.
selected_columns = songs_normalized_df.loc[
    :,
    [
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'tempo',
        'liveness',
        'valence',
    ],
]

In [68]:
# Standardise every feature column with StandardScaler. The scaler returns a
# plain array, so the column names are lost here; the resulting frame has
# integer column labels 0..7 in the same order as the selected features.
selected_columns = StandardScaler().fit(selected_columns).transform(selected_columns)
selected_columns_df = pd.DataFrame(data=selected_columns)
selected_columns_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.897123,-1.705987,-0.943379,-0.613512,1.467253,-0.006869,-0.585933,-1.385781
1,-0.413799,-0.016898,0.470972,-0.660248,-0.744967,0.596358,-0.433835,-1.209569
2,0.173531,-0.494347,-0.772413,-0.637896,-0.184997,0.286281,-0.547908,0.163034
3,-0.481097,-0.507860,0.421831,-0.661264,-0.421294,0.782916,-0.281736,-0.986985
4,0.289774,-0.620466,-0.341343,-0.651104,-0.364510,0.050795,1.619494,0.098114
...,...,...,...,...,...,...,...,...
811,0.179649,0.176784,0.551884,-0.510896,1.588149,1.422559,-0.547908,1.799956
812,1.274776,0.203809,0.542513,0.995838,-0.645868,-0.814286,-0.865414,0.891070
813,0.314246,0.127237,0.547084,-0.430631,-0.393818,-0.015989,-0.706344,0.047105
814,0.938284,-1.760038,-0.147292,-0.635864,2.489376,-0.673935,-0.566921,-0.319232


In [69]:
# Restore the feature names on the standardised frame: positional column
# label i (0..7) is mapped back to the i-th feature name.
normed_features_df = selected_columns_df.rename(
    columns=dict(
        enumerate(
            [
                'danceability',
                'energy',
                'loudness',
                'speechiness',
                'acousticness',
                'tempo',
                'liveness',
                'valence',
            ]
        )
    )
)

In [70]:
# Keep only the descriptive columns (everything except the track id and the
# raw feature values, which are replaced by their standardised versions in
# the join that follows).
# Derived from `songs_df` rather than from `songs_normalized_df` itself so
# the cell is idempotent: re-running it no longer raises KeyError because
# the columns were already dropped by a previous run.
songs_normalized_df = songs_df.drop(
    columns=[
        'id',
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'tempo',
        'liveness',
        'valence',
    ]
)

In [71]:
# Re-attach the standardised features to the descriptive columns.
# The join is index-aligned (left join on the row index).
songs_data = songs_normalized_df.join(other=normed_features_df, how='left')

In [8]:
# Display the combined frame: descriptive columns plus the standardised features.
songs_data

Unnamed: 0,name,artist,genre,danceability,energy,loudness,speechiness,acousticness,tempo,liveness,valence
0,"So Long, Goodbye - Live",10 Years,rock,-0.897123,-1.705987,-0.943379,-0.613512,1.467253,-0.006869,-0.585933,-1.385781
1,Here Without You,3 Doors Down,rock,-0.413799,-0.016898,0.470972,-0.660248,-0.744967,0.596358,-0.433835,-1.209569
2,What's Up?,4 Non Blondes,pop,0.173531,-0.494347,-0.772413,-0.637896,-0.184997,0.286281,-0.547908,0.163034
3,Ghost Of You,5 Seconds of Summer,pop,-0.481097,-0.507860,0.421831,-0.661264,-0.421294,0.782916,-0.281736,-0.986985
4,If It Means a Lot to You,A Day To Remember,post hardcore,0.289774,-0.620466,-0.341343,-0.651104,-0.364510,0.050795,1.619494,0.098114
...,...,...,...,...,...,...,...,...,...,...,...
811,Chicken Fried,Zac Brown Band,country,0.179649,0.176784,0.551884,-0.510896,1.588149,1.422559,-0.547908,1.799956
812,You Come First (feat. Saweetie),Zak Abel,pop,1.274776,0.203809,0.542513,0.995838,-0.645868,-0.814286,-0.865414,0.891070
813,PILLOWTALK,ZAYN,pop,0.314246,0.127237,0.547084,-0.430631,-0.393818,-0.015989,-0.706344,0.047105
814,Seven Nation Army,Zella Day,rock,0.938284,-1.760038,-0.147292,-0.635864,2.489376,-0.673935,-0.566921,-0.319232


## Loading and Modeling the Data ##

In [72]:
# Feature matrix fed to PCA and k-means: the eight standardised feature
# columns only, without the descriptive name/artist/genre columns.
X = songs_data.loc[
    :,
    [
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'tempo',
        'liveness',
        'valence',
    ],
]

In [73]:
# Reduce the feature matrix to three principal components.
pca = PCA(n_components=3)
pca_mdl = pca.fit_transform(X=X)

# Wrap the projected coordinates in a frame; its columns are the integer
# labels 0, 1, 2 and its default row index lines up with X row-for-row.
pca_df = pd.DataFrame(data=pca_mdl)
pca_df

Unnamed: 0,0,1,2
0,2.602736,-1.492037,0.158723
1,-0.478138,-1.526152,-0.370728
2,0.607138,-0.081452,-0.582575
3,-0.103425,-1.401394,-0.192039
4,0.155264,0.266602,0.644151
...,...,...,...
811,-0.112429,0.708473,-0.595045
812,-0.638217,1.635528,-0.987314
813,-0.439575,-0.142942,-0.988710
814,2.755775,0.396043,-0.870385


In [186]:
# Fit k-means on the full standardised feature matrix (not on the PCA
# projection, which the notebook only uses as plotting coordinates).
kmeans = KMeans(
    n_clusters=5,    # number of clusters to form
    random_state=1,  # fixed seed so cluster assignments are reproducible
).fit(X)

In [187]:
# Cluster label for every row of X. `kmeans` was already fitted on X when it
# was created, so read the labels from that fit instead of calling
# fit_predict, which would run the whole k-means fit a second time.
y = kmeans.labels_

In [188]:
# One-column frame holding the cluster label of each song.
y_df = pd.DataFrame({'Cluster': y})

# Append the labels to the song data column-wise (rows align on the index).
new_df = pd.concat([songs_data, y_df], axis='columns')
new_df

Unnamed: 0,name,artist,genre,danceability,energy,loudness,speechiness,acousticness,tempo,liveness,valence,Cluster
0,"So Long, Goodbye - Live",10 Years,rock,-0.897123,-1.705987,-0.943379,-0.613512,1.467253,-0.006869,-0.585933,-1.385781,1
1,Here Without You,3 Doors Down,rock,-0.413799,-0.016898,0.470972,-0.660248,-0.744967,0.596358,-0.433835,-1.209569,0
2,What's Up?,4 Non Blondes,pop,0.173531,-0.494347,-0.772413,-0.637896,-0.184997,0.286281,-0.547908,0.163034,4
3,Ghost Of You,5 Seconds of Summer,pop,-0.481097,-0.507860,0.421831,-0.661264,-0.421294,0.782916,-0.281736,-0.986985,0
4,If It Means a Lot to You,A Day To Remember,post hardcore,0.289774,-0.620466,-0.341343,-0.651104,-0.364510,0.050795,1.619494,0.098114,4
...,...,...,...,...,...,...,...,...,...,...,...,...
811,Chicken Fried,Zac Brown Band,country,0.179649,0.176784,0.551884,-0.510896,1.588149,1.422559,-0.547908,1.799956,4
812,You Come First (feat. Saweetie),Zak Abel,pop,1.274776,0.203809,0.542513,0.995838,-0.645868,-0.814286,-0.865414,0.891070,4
813,PILLOWTALK,ZAYN,pop,0.314246,0.127237,0.547084,-0.430631,-0.393818,-0.015989,-0.706344,0.047105,4
814,Seven Nation Army,Zella Day,rock,0.938284,-1.760038,-0.147292,-0.635864,2.489376,-0.673935,-0.566921,-0.319232,1


In [190]:
# Songs assigned to cluster 0 (original row index is preserved).
mdl_clstr_0 = new_df.loc[new_df['Cluster'] == 0]

In [None]:
# Attach the three PCA coordinates (columns 0, 1, 2) to the cluster-0 songs.
# An inner join on the row index keeps exactly the rows of mdl_clstr_0.
# An outer concat followed by dropna() would upcast `Cluster` to float and
# silently discard any song with a missing value in an unrelated column.
clstr_0 = pd.concat([mdl_clstr_0, pca_df], axis=1, join='inner')
clstr_0

In [None]:
# Songs assigned to cluster 1 (original row index is preserved).
mdl_clstr_1 = new_df.loc[new_df['Cluster'] == 1]

In [None]:
# Attach the three PCA coordinates (columns 0, 1, 2) to the cluster-1 songs.
# An inner join on the row index keeps exactly the rows of mdl_clstr_1.
# An outer concat followed by dropna() would upcast `Cluster` to float and
# silently discard any song with a missing value in an unrelated column.
clstr_1 = pd.concat([mdl_clstr_1, pca_df], axis=1, join='inner')
clstr_1

In [192]:
# Songs assigned to cluster 2 (original row index is preserved).
mdl_clstr_2 = new_df.loc[new_df['Cluster'] == 2]

In [None]:
# Attach the three PCA coordinates (columns 0, 1, 2) to the cluster-2 songs.
# An inner join on the row index keeps exactly the rows of mdl_clstr_2.
# An outer concat followed by dropna() would upcast `Cluster` to float and
# silently discard any song with a missing value in an unrelated column.
clstr_2 = pd.concat([mdl_clstr_2, pca_df], axis=1, join='inner')
clstr_2

In [243]:
# Songs assigned to cluster 3 (original row index is preserved).
mdl_clstr_3 = new_df.loc[new_df['Cluster'] == 3]

In [None]:
# Attach the three PCA coordinates (columns 0, 1, 2) to the cluster-3 songs.
# An inner join on the row index keeps exactly the rows of mdl_clstr_3.
# An outer concat followed by dropna() would upcast `Cluster` to float and
# silently discard any song with a missing value in an unrelated column.
clstr_3 = pd.concat([mdl_clstr_3, pca_df], axis=1, join='inner')
clstr_3

In [194]:
# Songs assigned to cluster 4 (original row index is preserved).
mdl_clstr_4 = new_df.loc[new_df['Cluster'] == 4]

In [None]:
# Attach the three PCA coordinates (columns 0, 1, 2) to the cluster-4 songs.
# An inner join on the row index keeps exactly the rows of mdl_clstr_4.
# An outer concat followed by dropna() would upcast `Cluster` to float and
# silently discard any song with a missing value in an unrelated column.
clstr_4 = pd.concat([mdl_clstr_4, pca_df], axis=1, join='inner')
clstr_4

## The Final Plot ##

In [242]:
# 3-D scatter of every song in PCA space, coloured by k-means cluster.
# - Song data and PCA coordinates are combined column-wise; the PCA columns
#   are the integer labels 0, 1, 2, which is why x/y/z are given as ints.
# - Cluster ids are nominal labels, so they are cast to strings: Plotly then
#   draws one discrete colour and legend entry per cluster instead of a
#   continuous colour bar that would suggest an ordering between clusters.
# - `category_orders` keeps the legend in numeric cluster order rather than
#   order of first appearance in the data.
# - `size_max` is not passed: it only has an effect together with `size`,
#   and the marker size is set explicitly below.
fig = px.scatter_3d(
    pd.concat([new_df, pca_df], axis=1).astype({'Cluster': str}),
    x=0, y=1, z=2,
    color='Cluster',
    category_orders={'Cluster': [str(c) for c in sorted(new_df['Cluster'].unique())]},
    hover_data=['name', 'artist', 'genre'],
    title='K-Means clusters in PCA space',
)

# Small markers with a thin black outline so neighbouring points stay
# distinguishable.
fig.update_traces(marker=dict(size=5,
                              line=dict(width=1,
                                        color='black')),
                  selector=dict(mode='markers'))

fig.show()

## Individual Cluster Plots ##

In [246]:
# Plot a single cluster in PCA space. To inspect another cluster, pass one of
# clstr_0, clstr_1, clstr_2, clstr_3 or clstr_4 as `data_frame`.
# The axis ranges are hard-coded, presumably so every cluster is drawn in the
# same frame of reference and the plots can be compared by eye.
fig = px.scatter_3d(
    data_frame=clstr_1,
    x=0,
    y=1,
    z=2,
    color='Cluster',
    size_max=0.01,
    hover_data=['name', 'artist', 'genre'],
    range_x=(-2, 6),
    range_y=(-2, 4),
    range_z=(-2, 5),
)

# Small markers with a thin cyan outline.
fig.update_traces(
    marker={'size': 5, 'line': {'width': 1, 'color': 'cyan'}},
    selector={'mode': 'markers'},
)

fig.show()