# K means Clustering Analysis

In [221]:
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
warnings.filterwarnings("ignore")

In [222]:
df = pd.read_csv('Data/Listening_Data/songs_listened_to_with_all_info_unique.csv')
df = df.drop('Unnamed: 0.1', axis=1)
df = df.drop('Unnamed: 0', axis = 1)
df = df.drop_duplicates(subset='trackName', keep='first')
df.head(2)

Unnamed: 0,endTime,artistName,trackName,albumName,msPlayed,month,year,Season,trackUri,danceability,...,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,1/20/2022 4:50,Lonr.,Make the Most (feat. H.E.R.),Land Of Nothing Real,60,1,2022,1,spotify:track:6UF1vUIoLleFrhcxg2L26V,0.566,...,0.0806,0.838,78.743,audio_features,6UF1vUIoLleFrhcxg2L26V,spotify:track:6UF1vUIoLleFrhcxg2L26V,https://api.spotify.com/v1/tracks/6UF1vUIoLleF...,https://api.spotify.com/v1/audio-analysis/6UF1...,213851,4
73,1/29/2022 20:28,Yung Bleu,You’re Mines Still (feat. Drake),You’re Mines Still (feat. Drake),5739,1,2022,1,spotify:track:7E2C5rBLpCKwQlhJPVFBRS,0.575,...,0.105,0.339,83.004,audio_features,7E2C5rBLpCKwQlhJPVFBRS,spotify:track:7E2C5rBLpCKwQlhJPVFBRS,https://api.spotify.com/v1/tracks/7E2C5rBLpCKw...,https://api.spotify.com/v1/audio-analysis/7E2C...,226220,4


In [223]:
# Get a metric for percentage played of each song.
df['Percentage_Listened'] = (df['msPlayed'] / df['duration_ms']) * 100

In [224]:
df_stats = df[['danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]
df_stats.head(2)

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.566,0.693,10,-6.917,0.368,0.361,0.0,0.0806,0.838,78.743
73,0.575,0.609,5,-4.88,0.121,0.317,0.0,0.105,0.339,83.004


In [225]:
scaler = MinMaxScaler()
scaler.fit(df_stats)
X=scaler.transform(df_stats)


In [226]:
inertia = []
for i in range(1,11):
    kmeans = KMeans(
        n_clusters=i, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
fig = go.Figure(data=go.Scatter(x=np.arange(1,11),y=inertia))
fig.update_layout(title="Inertia vs Cluster Number",xaxis=dict(range=[0,11],title="Cluster Number"),
                  yaxis={'title':'Inertia'},
                 annotations=[
        dict(
            x=4,
            y=inertia[3],
            xref="x",
            yref="y",
            text="Elbow",
            showarrow=True,
            arrowhead=7,
            ax=20,
            ay=-40
        )
    ])

Looks like the elbow is created around K = 3-4. So lets use 4!

Note: Inertia can be recognized as a measure of how internally coherent clusters are.

Inertia measures how well a dataset was clustered by K-Means. It is calculated by measuring the distance between each data point and its centroid, squaring this distance, and summing these squares across one cluster. A good model is one with low inertia AND a low number of clusters ( K )

In [227]:
# initialize can get stuck
kmeans = KMeans(
        n_clusters=4, init="k-means++",
        n_init=10,
        tol=1e-04, random_state=42
    )
kmeans.fit(X)

KMeans(n_clusters=4, random_state=42)

In [228]:
clusters=pd.DataFrame(X,columns=df_stats.columns)
clusters['label']=kmeans.labels_
polar=clusters.groupby("label").mean().reset_index()
polar=pd.melt(polar,id_vars=["label"])
fig = px.line_polar(polar, r="value", theta="variable", color="label", line_close=True,height=800,width=1400)
fig.show()

In [229]:
pie=clusters.groupby('label').size().reset_index()
pie.columns=['label','value']
px.pie(pie,values='value',names='label')

In [230]:
clustered_labels = clusters.groupby('label', as_index = False).mean()
clustered_labels

Unnamed: 0,label,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0,0.689748,0.597381,0.744694,0.818129,0.164643,0.125016,0.011099,0.161295,0.310898,0.405469
1,1,0.72166,0.698658,0.678138,0.84177,0.171314,0.164181,0.008011,0.186237,0.693888,0.431464
2,2,0.720847,0.642418,0.121301,0.829175,0.173697,0.155632,0.009124,0.170445,0.465891,0.409986
3,3,0.599492,0.392282,0.523848,0.717117,0.121421,0.689671,0.148498,0.131504,0.345786,0.364087


In [234]:
clusters.columns

Index(['danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'label'],
      dtype='object')

In [253]:
# get a most similar to value for each of the classifications 

# classification 1
from sklearn.metrics.pairwise import cosine_similarity
similar_arr = cosine_similarity(clusters, clustered_labels.head(1))
# index 1
indexes = np.argsort(similar_arr.T[0])[::-1][:5]
df.iloc[indexes].to_csv('Data/Clustering_Data/ClusterGroup1.csv')
df.iloc[indexes][['trackName', 'artistName', 'albumName', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',]]

Unnamed: 0,trackName,artistName,albumName,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
4283,RNP (feat. Anderson .Paak),Cordae,The Lost Boy,0.611,0.714,9,-4.696,0.856,0.184,0.0,0.145,0.628,196.043
10684,Real Friends,Kanye West,The Life Of Pablo,0.441,0.744,6,-6.248,0.476,0.262,0.000382,0.138,0.124,81.396
10565,Anti,SOB X RBE,SOB X RBE,0.702,0.785,7,-6.842,0.743,0.321,0.0,0.0682,0.647,132.857
4124,Ambition (feat. Meek Mill & Rick Ross),Wale,Ambition,0.42,0.744,8,-5.033,0.396,0.398,9e-06,0.192,0.324,161.792
1706,You,Jacquees,Street Love,0.412,0.876,5,-6.008,0.47,0.102,0.0,0.297,0.703,130.159


In [254]:
similar_arr = cosine_similarity(clusters, clustered_labels.loc[1].values.reshape(1, -1))
indexes = np.argsort(similar_arr.T[0])[::-1][:5]
df.iloc[indexes].to_csv('Data/Clustering_Data/ClusterGroup2.csv')
df.iloc[indexes][['trackName', 'artistName', 'albumName', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',]]

Unnamed: 0,trackName,artistName,albumName,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
5013,Hot N*gga,Bobby Shmurda,Hot N*gga,0.794,0.51,9,-7.314,0.42,0.0505,2e-06,0.0562,0.18,167.879
10705,"Slime Shit (feat. Yak Gotti, Duke & Peewee Ros...",Young Thug,Slime Season 3,0.896,0.423,8,-7.987,0.44,0.02,0.0,0.182,0.282,132.928
11782,Jackass,Mike Sherm,Chasin Chicken,0.941,0.637,9,-8.187,0.487,0.139,0.0,0.0617,0.32,97.036
11217,Shotta Flow,NLE Choppa,Shotta Flow,0.888,0.43,9,-6.728,0.588,0.0786,2e-06,0.132,0.45,120.088
8392,Gucci Flip Flops (feat. Lil Yachty),Bhad Bhabie,15,0.82,0.603,8,-7.18,0.308,0.0351,0.0159,0.122,0.177,159.976


In [255]:
similar_arr = cosine_similarity(clusters, clustered_labels.loc[2].values.reshape(1, -1))
indexes = np.argsort(similar_arr.T[0])[::-1][:5]
df.iloc[indexes].to_csv('Data/Clustering_Data/ClusterGroup3.csv')
df.iloc[indexes][['trackName', 'artistName', 'albumName', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',]]

Unnamed: 0,trackName,artistName,albumName,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
4413,That Nigga (feat. Jay Ant),Problem,Million Dollar Afro (Deluxe Edition),0.884,0.456,7,-10.105,0.356,0.00538,0.0,0.0911,0.292,97.041
11782,Jackass,Mike Sherm,Chasin Chicken,0.941,0.637,9,-8.187,0.487,0.139,0.0,0.0617,0.32,97.036
12412,OneFineBiddy,Chef E,OneFineBiddy,0.942,0.473,6,-10.292,0.211,0.14,0.0,0.0649,0.423,100.974
11193,What's the Move (feat. Lil Uzi Vert),Young Thug,So Much Fun,0.961,0.438,7,-7.877,0.31,0.0431,0.0,0.139,0.412,124.018
10705,"Slime Shit (feat. Yak Gotti, Duke & Peewee Ros...",Young Thug,Slime Season 3,0.896,0.423,8,-7.987,0.44,0.02,0.0,0.182,0.282,132.928


In [256]:
similar_arr = cosine_similarity(clusters, clustered_labels.loc[3].values.reshape(1, -1))
indexes = np.argsort(similar_arr.T[0])[::-1][:5]
df.iloc[indexes].to_csv('Data/Clustering_Data/ClusterGroup4.csv')
df.iloc[indexes][['trackName', 'artistName', 'albumName', 'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',]]

Unnamed: 0,trackName,artistName,albumName,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
12412,OneFineBiddy,Chef E,OneFineBiddy,0.942,0.473,6,-10.292,0.211,0.14,0.0,0.0649,0.423,100.974
6583,Beautiful Escape,Tom Misch,Beat Tape 2,0.944,0.514,5,-7.895,0.195,0.0684,0.00209,0.103,0.331,120.004
10710,Chill Bill (feat. J. Davi$ & Spooks),Rob $tone,Chill Bill (feat. J. Davi$ & Spooks),0.886,0.427,6,-10.028,0.145,0.0312,0.00099,0.0906,0.23,108.034
4413,That Nigga (feat. Jay Ant),Problem,Million Dollar Afro (Deluxe Edition),0.884,0.456,7,-10.105,0.356,0.00538,0.0,0.0911,0.292,97.041
13793,Outstanding,MadeinTYO,"Sincerely, Tokyo",0.943,0.461,5,-6.066,0.115,0.025,0.0,0.0846,0.149,130.002


In [None]:
for label in clusters['label'].unique():
    clusters[clusters['label'] == label].to_csv('Data/Clustering_Data/' + str(label) + '.csv')

In [None]:
clusters.columns

Index(['danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'label', 'artistName', 'trackName', 'Season', 'month', 'endTime'],
      dtype='object')

In [None]:

fig = px.scatter(clusters.sort_values('label'), x = 'artistName', y = 'trackName',color='label', title='4-means clustered groups of songs from my spotify')
fig.show()
fig.write_html('clusteredgroups.html')
