In [2]:
# Imports and global display/output settings
import pandas as pd
import numpy as np

# scikit-learn: PCA is used for the dimensionality reduction further down
from sklearn.decomposition import PCA
# make scikit-learn transformers return pandas DataFrames instead of NumPy arrays
from sklearn import set_config
set_config(transform_output="pandas")  

# project helpers used in this notebook (scaling, inertia_plot, silhouette_plot, clustering_n_dim)
# NOTE(review): the wildcard import runs after the imports above and can silently shadow
# names such as pd, np or PCA; prefer explicit imports once the list of used helpers is confirmed.
from functions_ML import *
from functions_ML import clustering_n_dim
# show up to 100 characters per cell when a DataFrame is displayed
pd.set_option("display.max_colwidth", 100)

In [3]:
# %matplotlib inline   # enable in case of problems with plt.show() inside a function

# Import and some Cleaning

In [5]:
# Load the song data set (roughly 5000 tracks with their Spotify audio features) and preview it
df = pd.read_csv('3_spotify_5000_songs.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,name,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature,id,html
0,0,Se Eu Quiser Falar Com Deus ...,Gilberto Gil,0.658,0.259,11,-13.141,0,0.0705,0.694,5.9e-05,0.975,0.306,110.376,,256213,4,1n7JnwviZ7zf0LR1tcGFq7,https://open.spotify.com/track/1n7JnwviZ7zf0LR1tcGFq7
1,1,Saudade De Bahia ...,Antônio Carlos Jobim,0.742,0.399,2,-12.646,1,0.0346,0.217,2e-06,0.107,0.693,125.039,,191867,4,5QGM1U0eCYrQuwSJwTm5Zq,https://open.spotify.com/track/5QGM1U0eCYrQuwSJwTm5Zq
2,2,"Canta Canta, Minha Gente ...",Martinho Da Vila,0.851,0.73,2,-11.048,1,0.347,0.453,6.3e-05,0.124,0.905,93.698,,152267,4,0NLIFSZxPzQhCwnkn5PJYs,https://open.spotify.com/track/0NLIFSZxPzQhCwnkn5PJYs
3,3,Mulher Eu Sei ...,Chico César,0.705,0.0502,4,-18.115,1,0.0471,0.879,4.1e-05,0.386,0.524,106.802,,186227,4,3mXqOdlLE1k67WsAxryPFs,https://open.spotify.com/track/3mXqOdlLE1k67WsAxryPFs
4,4,Rosa Morena ...,Kurt Elling,0.651,0.119,6,-19.807,1,0.038,0.916,0.000343,0.104,0.402,120.941,,273680,4,7bSzjzjTkWT2CkIPPdp0eA,https://open.spotify.com/track/7bSzjzjTkWT2CkIPPdp0eA


In [6]:
# Strip surrounding whitespace from the column labels, then build a readable
# "<song> - <artist>" label for every row and use it as the index.
# Note: this label is not guaranteed to be unique (the same song can occur more than once).
df = (
    df.rename(columns=str.strip)
    .copy()
    .assign(**{'song/artist': lambda frame: frame['name'].str.strip() + ' - ' + frame['artist'].str.strip()})
    .set_index(['song/artist'])
)

In [7]:
# Columns that do not help to tell different kinds of music apart
# (bookkeeping columns, identifiers and features excluded from the clustering).
to_drop = [
    'Unnamed: 0',
    'name', 'artist',
    'mode', 'duration_ms', 'time_signature', 'type',
    'id', 'html',
]
songs_df = df.drop(columns=to_drop).copy()

# Keep the Spotify id and URL aside so they can be re-attached to the results later.
remaining = df.loc[:, ['id', 'html']].copy()

songs_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Se Eu Quiser Falar Com Deus - Gilberto Gil,0.658,0.259,11,-13.141,0.0705,0.694,5.9e-05,0.975,0.306,110.376
Saudade De Bahia - Antônio Carlos Jobim,0.742,0.399,2,-12.646,0.0346,0.217,2e-06,0.107,0.693,125.039
"Canta Canta, Minha Gente - Martinho Da Vila",0.851,0.73,2,-11.048,0.347,0.453,6.3e-05,0.124,0.905,93.698
Mulher Eu Sei - Chico César,0.705,0.0502,4,-18.115,0.0471,0.879,4.1e-05,0.386,0.524,106.802
Rosa Morena - Kurt Elling,0.651,0.119,6,-19.807,0.038,0.916,0.000343,0.104,0.402,120.941


# Exploration

In [9]:
songs_df.info() # every column is numeric (float64 or int64) and has no missing values

<class 'pandas.core.frame.DataFrame'>
Index: 5235 entries, Se Eu Quiser Falar Com Deus - Gilberto Gil to Ravel: Boléro, M. 81 - Maurice Ravel
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      5235 non-null   float64
 1   energy            5235 non-null   float64
 2   key               5235 non-null   int64  
 3   loudness          5235 non-null   float64
 4   speechiness       5235 non-null   float64
 5   acousticness      5235 non-null   float64
 6   instrumentalness  5235 non-null   float64
 7   liveness          5235 non-null   float64
 8   valence           5235 non-null   float64
 9   tempo             5235 non-null   float64
dtypes: float64(9), int64(1)
memory usage: 449.9+ KB


In [10]:
# number of distinct values per feature
songs_df.nunique()

danceability         882
energy              1191
key                   12
loudness            4310
speechiness         1001
acousticness        2545
instrumentalness    2168
liveness            1128
valence             1267
tempo               4824
dtype: int64

In [11]:
# import the profiling tool (NOTE(review): imports normally belong in the first cell of the notebook)
from ydata_profiling import ProfileReport
# build a profile report for the feature table ...
report = ProfileReport(songs_df)
# ... and write it to an HTML file next to the notebook
report.to_file('report5000_reduced.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# Scaling
Preprocessing: rescale the data with a **robust scaler**, as the metrics have different scales<br>using RobustScaler() from sklearn.preprocessing

In [13]:
# scale the features with the project helper; the 'robust' option presumably selects
# sklearn's RobustScaler (not a min-max scaler) — confirm in functions_ML
songs_df_scaled = scaling(songs_df,'robust')
songs_df_scaled.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Se Eu Quiser Falar Com Deus - Gilberto Gil,0.313984,-1.157005,1.0,-0.955541,0.246438,1.063138,-0.002833,5.882353,-0.230769,-0.232633
Saudade De Bahia - Antônio Carlos Jobim,0.53562,-0.818841,-0.5,-0.874033,-0.355407,0.215967,-0.002921,-0.124567,0.573805,0.154816
"Canta Canta, Minha Gente - Martinho Da Vila",0.823219,-0.019324,-0.5,-0.610901,4.881811,0.635112,-0.002825,-0.00692,1.014553,-0.673325
Mulher Eu Sei - Chico César,0.437995,-1.661353,-0.166667,-1.774576,-0.145851,1.391706,-0.002861,1.806228,0.222453,-0.327071
Rosa Morena - Kurt Elling,0.295515,-1.495169,0.166667,-2.053186,-0.298407,1.457419,-0.00239,-0.145329,-0.031185,0.046532


# PCA 
Applying a **Principal Component Analysis** to reduce the data to its most important features<br>using PCA() from sklearn.decomposition

In [15]:
# Initialise the PCA object; a float n_components between 0 and 1 asks scikit-learn to keep
# as many components as are needed to explain that share of the variance (here 95 %)
myPca = PCA(n_components = 0.95)

# Fit the PCA on the scaled features and project them onto the retained components
songs_df_pca = myPca.fit_transform(songs_df_scaled)

songs_df_pca.head()

Unnamed: 0_level_0,pca0,pca1,pca2,pca3,pca4,pca5,pca6
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Se Eu Quiser Falar Com Deus - Gilberto Gil,0.047937,0.094044,5.429036,1.453643,0.462327,0.921797,-0.43982
Saudade De Bahia - Antônio Carlos Jobim,-0.838632,-0.691908,-0.533856,0.641386,0.510957,-0.497327,0.296778
"Canta Canta, Minha Gente - Martinho Da Vila",2.158718,3.457972,-1.044947,1.998786,0.176691,-0.595117,0.549465
Mulher Eu Sei - Chico César,-1.723684,0.187734,1.567998,1.490205,0.607838,-0.163217,-0.194227
Rosa Morena - Kurt Elling,-2.303099,0.247676,-0.240701,0.874438,0.78154,0.216339,-0.259728


# Assessing optimal K

## Inertia Method
* Calculation of inertia scores
* By searching for an elbow I assess the k beyond which the Euclidean distances within the clusters stop decreasing substantially


In [18]:
max_k = 50   # upper bound for k handed to the plotting helper
seed = 123   # seed value handed to the plotting helper
# using my inertia_plot function (project helper from functions_ML)
inertia_plot(songs_df_pca, max_k, seed)

* The elbow is found at k=6 or k=7 here<br>this is the mathematically best trade-off for k: beyond it, additional clusters reduce the Euclidean distances only slightly
* For business reasons we have been asked not to create playlists larger than 250 songs,<br> which implies k values larger than 20 (5235 songs / 250 ≈ 21)

  **k > 20**

## Silhouette Score
* Calculation of silhouette scores
* By evaluation of local maxima, k-values with most distinguishable<br>clusters can be found

In [21]:
min_k = 10   # lower bound for k handed to the plotting helper
max_k = 50   # upper bound for k (same value as for the inertia plot)
seed = 123   # same seed value as for the inertia plot

# using my silhouette plot function (project helper from functions_ML)
silhouette_plot(songs_df_pca,min_k, max_k, seed)

* For business reasons a minimum number of 20 playlists is considered (k > 20)
* Here, I would use the local maximum at k = 36, as it promises well-distinguishable clusters

In [23]:
k = 36  # number of clusters (= playlists), chosen from the silhouette analysis

# Clustering
* apply the k-means clustering algorithm with the optimized k to the dataset and extract the playlist numbers (column = table)

In [25]:
# # visualize clusters for dimensions
# random_seed = 123

# print('WitPCA')
# two_dimension_exploration(songs_df_pca, 'pca0', 'pca1', k, random_seed)

In [26]:
# cluster on all PCA components with k clusters (project helper from functions_ML);
# 123 is presumably the random seed (same value as `seed` in the plotting cells) — confirm against the helper
songs_df_cluster = clustering_n_dim(songs_df_pca, k, 123)

In [27]:
# preview the result: the PCA components plus the assigned cluster number in column 'table'
songs_df_cluster.head()

Unnamed: 0_level_0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,table
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Se Eu Quiser Falar Com Deus - Gilberto Gil,0.047937,0.094044,5.429036,1.453643,0.462327,0.921797,-0.43982,7
Saudade De Bahia - Antônio Carlos Jobim,-0.838632,-0.691908,-0.533856,0.641386,0.510957,-0.497327,0.296778,6
"Canta Canta, Minha Gente - Martinho Da Vila",2.158718,3.457972,-1.044947,1.998786,0.176691,-0.595117,0.549465,17
Mulher Eu Sei - Chico César,-1.723684,0.187734,1.567998,1.490205,0.607838,-0.163217,-0.194227,10
Rosa Morena - Kurt Elling,-2.303099,0.247676,-0.240701,0.874438,0.78154,0.216339,-0.259728,10


# Assessing the playlists
* attach the playlist numbers to the original (unscaled) metrics
* calculate the mean of each metric per playlist to detect playlist-specific characteristics

In [29]:
# attach the playlist number (column 'table') to the original, unscaled metrics
# NOTE(review): the 'song/artist' index is not unique, so this column-wise concat presumably relies on
# both frames carrying the identical index in the same row order — do not re-sort either frame beforehand
songs_df_assessing =  pd.concat([songs_df, songs_df_cluster['table']], axis=1)
# group by playlist and calculate the mean of every metric
songs_df_assessing.groupby('table').mean()

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
table,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.306838,0.801081,3.222222,-12.25601,0.062621,0.040096,0.790871,0.212867,0.303801,119.317859
1,0.364879,0.888121,5.034483,-5.673328,0.262069,0.045473,0.210917,0.153319,0.250362,152.805414
2,0.770303,0.703764,5.654545,-6.1256,0.131085,0.184679,0.010494,0.098243,0.678685,99.463
3,0.273314,0.919407,5.071429,-7.306979,0.090265,0.011589,0.586824,0.350521,0.229825,104.657736
4,0.346793,0.208268,7.742647,-16.115309,0.042737,0.893154,0.495722,0.171338,0.190448,91.636397
5,0.648132,0.77742,7.8107,-5.871494,0.050958,0.155629,0.050828,0.344374,0.589329,120.031362
6,0.691627,0.544524,1.679245,-10.021566,0.042898,0.392017,0.058963,0.1097,0.727151,116.734901
7,0.432981,0.211148,5.837838,-18.467757,0.054916,0.864135,0.301374,0.716649,0.3071,103.909108
8,0.545858,0.7085,1.656716,-7.256918,0.057622,0.225055,0.065661,0.123104,0.691903,165.558075
9,0.732239,0.777627,2.070588,-5.288494,0.052355,0.127196,0.010163,0.110406,0.74249,111.089267


In [30]:
# standard deviation of every metric within each playlist
songs_df_assessing.groupby('table').std()

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
table,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.09903,0.138496,2.405587,2.679612,0.021062,0.153396,0.194522,0.077267,0.173549,17.47593
1,0.179672,0.131987,3.548983,2.116678,0.036613,0.118251,0.342468,0.084078,0.229818,27.035644
2,0.106105,0.134626,3.341634,2.236386,0.029095,0.194553,0.069061,0.037866,0.177983,13.317871
3,0.095269,0.071983,3.389877,2.627414,0.023083,0.044978,0.348009,0.071345,0.124481,15.184966
4,0.150679,0.120982,2.497395,3.040062,0.013836,0.145931,0.409985,0.095656,0.150516,16.031976
5,0.119919,0.135994,1.903989,2.331496,0.018748,0.206806,0.161365,0.066962,0.222586,17.628847
6,0.109317,0.116564,1.638096,2.267709,0.016068,0.255991,0.174478,0.045316,0.190369,14.201107
7,0.18019,0.127593,3.484199,5.205153,0.02798,0.105629,0.383576,0.120071,0.219744,25.947289
8,0.141422,0.167333,1.677379,2.570025,0.023743,0.266081,0.187769,0.060128,0.195816,14.683029
9,0.100519,0.103168,1.827965,1.533548,0.018966,0.141728,0.052163,0.05146,0.152926,15.664757


In [31]:
# Combine the original metrics, the playlist number and the Spotify id/URL column-wise,
# then order the rows by playlist number so the songs can be looked up in Spotify per playlist.
songs_df_final = (
    pd.concat([songs_df, songs_df_cluster['table'], remaining], axis=1)
    .sort_values(by='table')
)
songs_df_final.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,table,id,html
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Miscarriage - Pungent Stench / Disharmonic Orchestra Split - Pungent Stench,0.34,0.866,1,-9.768,0.0633,2e-06,0.937,0.276,0.315,89.628,0,75DT7DlrjKbl86Fz48RVKw,https://open.spotify.com/track/75DT7DlrjKbl86Fz48RVKw
Dells of Pain - Blasphereion,0.275,0.746,7,-15.441,0.0668,4.7e-05,0.956,0.361,0.177,132.383,0,3TInHinySztLwZ1Y5Nw70B,https://open.spotify.com/track/3TInHinySztLwZ1Y5Nw70B
Death Evocation - Carnage,0.25,0.713,6,-14.461,0.0371,9.1e-05,0.961,0.299,0.323,115.429,0,16qW2MNMkXVEjaHphQWqE7,https://open.spotify.com/track/16qW2MNMkXVEjaHphQWqE7
Demon's Blood - Blasphereion,0.319,0.701,9,-15.786,0.0697,0.000165,0.933,0.221,0.106,130.517,0,6cZkX4s9lpL8XHwlzaEJmf,https://open.spotify.com/track/6cZkX4s9lpL8XHwlzaEJmf
Blasphemies of the Flesh - Carnage,0.24,0.596,7,-16.03,0.0511,9.4e-05,0.805,0.216,0.331,117.602,0,4YQ3OHjzcUDWZFVaVffZDt,https://open.spotify.com/track/4YQ3OHjzcUDWZFVaVffZDt


In [32]:
# Assessing how the k-means algorithm works for automated playlist creation
## Get samples of playlists

In [33]:
# Show up to 20 random songs of one playlist.
no = 0 # number of the playlist
playlist_songs = songs_df_final.loc[songs_df_final['table'] == no, :]
# DataFrame.sample raises a ValueError when asked for more rows than exist, so cap the
# sample size at the playlist size; playlists with at least 20 songs give the same sample as before.
playlist_songs.sample(min(20, len(playlist_songs)), random_state=123)

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,table,id,html
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Beginning of Sorrow - Suffocation,0.345,0.782,9,-15.644,0.069,0.0151,0.899,0.178,0.518,110.465,0,5YnVOFa6Gg9N34tBpgimsK,https://open.spotify.com/track/5YnVOFa6Gg9N34tBpgimsK
Stench of Prophet - Brutal Truth,0.194,0.964,6,-14.119,0.0658,1e-06,0.942,0.0747,0.063,133.358,0,6b2pUMt6tQVEI2JThxeZfY,https://open.spotify.com/track/6b2pUMt6tQVEI2JThxeZfY
Heartwork - Carcass,0.201,0.955,4,-13.79,0.0574,1e-06,0.328,0.307,0.296,91.808,0,3wNILeoQCtHguNEam5le75,https://open.spotify.com/track/3wNILeoQCtHguNEam5le75
Baal Reginon - Therion,0.266,0.689,0,-6.302,0.0331,1.8e-05,0.803,0.181,0.477,113.228,0,4CtGvjULcucdLKNsaA0Dfr,https://open.spotify.com/track/4CtGvjULcucdLKNsaA0Dfr
Mirall - Les Sueques,0.485,0.527,2,-8.806,0.0257,0.367,0.85,0.149,0.364,96.527,0,5mxzLPrYmE7IdyEQAqZNLF,https://open.spotify.com/track/5mxzLPrYmE7IdyEQAqZNLF
Miscarriage - Pungent Stench / Disharmonic Orchestra Split - Pungent Stench,0.34,0.866,1,-9.768,0.0633,2e-06,0.937,0.276,0.315,89.628,0,75DT7DlrjKbl86Fz48RVKw,https://open.spotify.com/track/75DT7DlrjKbl86Fz48RVKw
Sweet Lobotomy - Avulsed,0.211,0.939,0,-11.018,0.0539,3.2e-05,0.927,0.314,0.301,146.439,0,22yWu8dBJ6pExcW1120b0P,https://open.spotify.com/track/22yWu8dBJ6pExcW1120b0P
With Or Without You - Remastered - U2,0.54,0.429,2,-11.822,0.0285,0.000202,0.355,0.141,0.113,110.171,0,6ADSaE87h8Y3lccZlBJdXH,https://open.spotify.com/track/6ADSaE87h8Y3lccZlBJdXH
Manic - Monstrosity,0.31,0.903,1,-11.35,0.0512,2e-05,0.857,0.141,0.524,103.36,0,6Y41eEZ7w6FzOyFhtbmwxl,https://open.spotify.com/track/6Y41eEZ7w6FzOyFhtbmwxl
Blasphemies of the Flesh - Carnage,0.24,0.596,7,-16.03,0.0511,9.4e-05,0.805,0.216,0.331,117.602,0,4YQ3OHjzcUDWZFVaVffZDt,https://open.spotify.com/track/4YQ3OHjzcUDWZFVaVffZDt


In [34]:
# Build a newline-separated string of the sampled songs' Spotify URLs; inserting this string
# into Spotify adds the tracks to a playlist.
no = 0
playlist_urls = songs_df_final.loc[songs_df_final['table'] == no, 'html']
# Cap the sample at the playlist size (sample() raises if n exceeds the number of rows) and join
# the URLs directly instead of post-processing the list repr, which would corrupt any URL that
# contains a comma, bracket, quote or space.
test = '\n'.join(playlist_urls.sample(min(20, len(playlist_urls)), random_state=123).astype(str))
print(test) # now you can copy-paste this into a Spotify playlist

https://open.spotify.com/track/5YnVOFa6Gg9N34tBpgimsK
https://open.spotify.com/track/6b2pUMt6tQVEI2JThxeZfY
https://open.spotify.com/track/3wNILeoQCtHguNEam5le75
https://open.spotify.com/track/4CtGvjULcucdLKNsaA0Dfr
https://open.spotify.com/track/5mxzLPrYmE7IdyEQAqZNLF
https://open.spotify.com/track/75DT7DlrjKbl86Fz48RVKw
https://open.spotify.com/track/22yWu8dBJ6pExcW1120b0P
https://open.spotify.com/track/6ADSaE87h8Y3lccZlBJdXH
https://open.spotify.com/track/6Y41eEZ7w6FzOyFhtbmwxl
https://open.spotify.com/track/4YQ3OHjzcUDWZFVaVffZDt
https://open.spotify.com/track/6XdCeut3LzIYBVij7YPglW
https://open.spotify.com/track/4w057gecOTjwAwiRYnlKB0
https://open.spotify.com/track/2fSYadDsbWMtIVqX7flKIc
https://open.spotify.com/track/5dnWuNRDarJhJEFabsbg8u
https://open.spotify.com/track/1Q2cltswPY0iojjk0YtN5n
https://open.spotify.com/track/6lAV4LlATkNy2K2gZYqSa3
https://open.spotify.com/track/0rp143yMMm90tRJ5Ihsh4B
https://open.spotify.com/track/33OpaNKNDVWSvyxc0q8Sgz
https://open.spotify.com/tra

**Assessing the playlist creation**
* For extreme genres (like metal or classical music) the playlist creation already works well
* Many playlists do not capture the tone perceived by humans
* As first step, the assessment of the metrics themselves is proposed  

## Assessment of the metrics: Eminem example
* Compare speechiness and instrumentalness
* check metrics of three similar rap songs

In [37]:
# Select every row whose "song - artist" label mentions Eminem (case-insensitive);
# because the label also contains the song title, a title mentioning Eminem matches as well.
is_eminem = songs_df_final.index.str.contains('Eminem', case=False, regex=True)
eminem = songs_df_final.loc[is_eminem, :]
eminem

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,table,id,html
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Lose Yourself - From ""8 Mile"" Soundtrack - Eminem",0.701,0.728,2,-4.554,0.255,0.00971,0.00115,0.361,0.0591,171.388,1,7w9bgPAmPTtrkt2v16QWvQ,https://open.spotify.com/track/7w9bgPAmPTtrkt2v16QWvQ
Shake That - Eminem,0.964,0.642,1,-5.831,0.109,0.0543,3.7e-05,0.158,0.547,107.006,2,6KqKg8IPuvtDB3PNAvffFf,https://open.spotify.com/track/6KqKg8IPuvtDB3PNAvffFf
You Don't Know - Eminem,0.767,0.864,10,-1.847,0.133,0.0116,0.0,0.0957,0.747,85.475,2,0nKZeSrqH9u31NLoTUsYBR,https://open.spotify.com/track/0nKZeSrqH9u31NLoTUsYBR
Without Me - Eminem,0.919,0.657,7,-2.823,0.0907,0.00293,0.0,0.356,0.659,112.23,5,5pBvNeOAJ54zgd5lEOmM3b,https://open.spotify.com/track/5pBvNeOAJ54zgd5lEOmM3b
The Real Slim Shady - Eminem,0.949,0.661,5,-4.244,0.0572,0.0302,0.0,0.0454,0.76,104.504,9,3yfqSUWxFvZELEM4PmlwIR,https://open.spotify.com/track/3yfqSUWxFvZELEM4PmlwIR
The Real Slim Shady - Eminem,0.948,0.656,5,-4.34,0.0606,0.0287,0.0,0.0472,0.77,104.495,9,7KccdUP4IslFXUNNseqBc7,https://open.spotify.com/track/7KccdUP4IslFXUNNseqBc7
The Real Slim Shady - Eminem,0.949,0.661,5,-4.244,0.0572,0.0302,0.0,0.0454,0.76,104.504,9,3yfqSUWxFvZELEM4PmlwIR,https://open.spotify.com/track/3yfqSUWxFvZELEM4PmlwIR
My Name Is - Eminem,0.833,0.681,5,-6.248,0.356,0.0406,0.0,0.0926,0.839,85.497,17,0l08dcPEqNEUhymVBext8h,https://open.spotify.com/track/0l08dcPEqNEUhymVBext8h
Mockingbird - Eminem,0.637,0.678,0,-3.798,0.266,0.209,0.0,0.156,0.254,84.039,17,4Tjg4jsELqr8cSgwDZ4twe,https://open.spotify.com/track/4Tjg4jsELqr8cSgwDZ4twe
Crack A Bottle - Eminem,0.516,0.874,9,-2.571,0.186,0.0864,0.0,0.173,0.391,169.561,24,4dK00wCxlqWEeN8BoM1BHT,https://open.spotify.com/track/4dK00wCxlqWEeN8BoM1BHT


In [38]:
# Newline-separated Spotify URLs of all rows whose label mentions Eminem
eminem_urls = songs_df_final.loc[songs_df_final.index.str.contains('Eminem', case=False, regex=True), 'html']
# join the URLs directly instead of post-processing the list repr, which would corrupt
# any URL that contains a comma, bracket, quote or space
eminem_html = '\n'.join(eminem_urls.astype(str))

print(eminem_html)

https://open.spotify.com/track/7w9bgPAmPTtrkt2v16QWvQ
https://open.spotify.com/track/6KqKg8IPuvtDB3PNAvffFf
https://open.spotify.com/track/0nKZeSrqH9u31NLoTUsYBR
https://open.spotify.com/track/5pBvNeOAJ54zgd5lEOmM3b
https://open.spotify.com/track/3yfqSUWxFvZELEM4PmlwIR
https://open.spotify.com/track/7KccdUP4IslFXUNNseqBc7
https://open.spotify.com/track/3yfqSUWxFvZELEM4PmlwIR
https://open.spotify.com/track/0l08dcPEqNEUhymVBext8h
https://open.spotify.com/track/4Tjg4jsELqr8cSgwDZ4twe
https://open.spotify.com/track/4dK00wCxlqWEeN8BoM1BHT
https://open.spotify.com/track/4xkOaSrkexMciUUogZKVTS
https://open.spotify.com/track/1ne7JVHEPnoncXzQunQVKW
https://open.spotify.com/track/2gFUtFej24YGR3PzJUJzKb


In [39]:
# Compare speechiness and instrumentalness of the songs.
# Rows are picked by position (positions 5 and 6 are further "The Real Slim Shady" entries and are skipped);
# the two metrics are selected by name rather than by column position, so the cell keeps
# showing the intended metrics even if the column order of the frame changes.
eminem.iloc[[0,1,2,3,4,7,8,9,10,11]][['speechiness', 'instrumentalness']]

Unnamed: 0_level_0,speechiness,instrumentalness
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1
"Lose Yourself - From ""8 Mile"" Soundtrack - Eminem",0.255,0.00115
Shake That - Eminem,0.109,3.7e-05
You Don't Know - Eminem,0.133,0.0
Without Me - Eminem,0.0907,0.0
The Real Slim Shady - Eminem,0.0572,0.0
My Name Is - Eminem,0.356,0.0
Mockingbird - Eminem,0.266,0.0
Crack A Bottle - Eminem,0.186,0.0
'Till I Collapse - Eminem,0.186,0.0
W.T.P. - Eminem,0.247,0.0


**Speechiness**
* The speechiness describes the presence of spoken words
* for rap songs the values should be between 0.33 and 0.66
* although these songs are clearly considered rap songs, the speechiness of most of them is too low

**Instrumentalness**
* The instrumentalness increases as less text (vocals) is in the song
* this metric describes the eminem songs well

In [41]:
# pick three songs (by row position) which I perceive as similar;
# the slice :-2 hides the last two columns, i.e. the Spotify id and URL
eminem.iloc[[3,4,7,],:-2]

Unnamed: 0_level_0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,table
song/artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Without Me - Eminem,0.919,0.657,7,-2.823,0.0907,0.00293,0.0,0.356,0.659,112.23,5
The Real Slim Shady - Eminem,0.949,0.661,5,-4.244,0.0572,0.0302,0.0,0.0454,0.76,104.504,9
My Name Is - Eminem,0.833,0.681,5,-6.248,0.356,0.0406,0.0,0.0926,0.839,85.497,17


* these exemplary songs have similar values for most metrics, which implies that in some cases the metrics capture the tone
  of songs perceived by humans 

# Conclusion
* As a first draft, using the k-means algorithm yields some good playlists, especially for extreme genres like classical or metal music.
* Nevertheless, some metrics show inconsistencies, resulting in numerous playlists not capturing the right tone of songs.
* Either some metrics themselves have to be reconsidered, as shown with the Eminem example or some metrics have to be dropped.
* Due to this metric issue, at this point, it cannot be evaluated if the k-means algorithm works well for automated playlist creation
  (Is it the metric itself or the algorithm causing strange playlists?).
* On the other hand, it is shown that the tone of songs can be captured, so further investigation is plausible.