In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import json
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pickle

# Imports
from sklearn.cluster import KMeans
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

In [6]:
# path to playlist file
root_path = !pwd
root_path = str(root_path[0])
# filename 
filepath = f"/home/ernek/Main/Erdos/song_recommender/playlist_data/sampledata/"
filename = 'mpd.slice.0-999.json'
# path + filename
fpath_name = f"{filepath}{filename}"
# print(fpath_name)

In [7]:
# Open file of playlist and obtain fields of json file 
with open(fpath_name) as data_file:    
    data = json.load(data_file)  

In [8]:
# # Explore fields in json file
# print("File keys: ", list(data.keys()))
# print("info: ",data[list(data.keys())[0]])
# #print(f"{list(data.keys())[1]} : ",data[list(data.keys())[1]])
# print("playlists: ",list(data['playlists'][0].keys()))
# print("tracks: ", list(data['playlists'][0]['tracks'][0].keys()))

In [9]:
# Extract track features
num_keys_old = 0
for index, playlist in enumerate(data['playlists']):
    num_keys =  len(playlist.keys())
    if index == 0:
        num_key_old = num_keys
        continue
    if num_keys > num_key_old:
        keys = playlist.keys()
    num_key_old = num_keys

In [10]:
# Construct keys of playlist
keys = list(keys)
keys.remove('tracks')
print(" Playlist keys: ", keys)

 Playlist keys:  ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 'num_followers', 'num_edits', 'duration_ms', 'num_artists', 'description']


In [11]:
# Create dataframe with track and playlist info
music_df = pd.json_normalize(data['playlists'],  meta = keys, meta_prefix = 'playlist_', errors='ignore', record_path=['tracks'], record_prefix = 'track_')
music_df.head()

Unnamed: 0,track_pos,track_artist_name,track_track_uri,track_artist_uri,track_track_name,track_album_uri,track_duration_ms,track_album_name,playlist_name,playlist_collaborative,playlist_pid,playlist_modified_at,playlist_num_tracks,playlist_num_albums,playlist_num_followers,playlist_num_edits,playlist_duration_ms,playlist_num_artists,playlist_description
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,Throwbacks,False,0,1493424000,52,47,1,6,11532414,37,


In [12]:
# Check column values 
music_df[music_df['playlist_pid'] == 0].columns

Index(['track_pos', 'track_artist_name', 'track_track_uri', 'track_artist_uri',
       'track_track_name', 'track_album_uri', 'track_duration_ms',
       'track_album_name', 'playlist_name', 'playlist_collaborative',
       'playlist_pid', 'playlist_modified_at', 'playlist_num_tracks',
       'playlist_num_albums', 'playlist_num_followers', 'playlist_num_edits',
       'playlist_duration_ms', 'playlist_num_artists', 'playlist_description'],
      dtype='object')

In [13]:
subset_df = music_df

In [14]:
path = '/home/ernek/Main/Erdos/song_recommender/song_data/'
def get_song_dataframe(path):
    # returns a dataframe of all of the songs in the million playlist data set
    # path - a string containing a path to the folder where the csv files are held.
    #        for example, on my machine, the folder where I ran this script also contained a folder
    #        called "song_data" which had the csv files in it. So I called get_song_dataframe('song_data/')
    #        NOTE - Make sure to include the slash!
    file_name_list = ['song.slice.' + str(i) + '-' + str(i + 49999) + '.csv' for i in range(0,2212292, 50000)] 
    file_name_list = file_name_list + ['song.slice.2250000-2262292.csv']
                        
    df_list = []
    for file_name in file_name_list:
        df_list.append(pd.read_csv(path + file_name))
    
    data = pd.concat(df_list)
    return data

In [15]:
data = get_song_dataframe(path)
data.head()

Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,num_playlist_appearances,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,uri,time_signature
0,0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226864,The Cookbook,6840,...,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,audio_features,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,4.0
1,1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,13204,...,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,audio_features,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,4.0
2,2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),16913,...,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,audio_features,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,4.0
3,3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267267,Justified,9065,...,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,audio_features,spotify:track:1AWQoqb9bSvzTjaLralEkT,4.0
4,4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,23172,...,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,audio_features,spotify:track:1lzr43nnXAijIGYnCT8M8H,4.0


In [16]:
data.columns

Index(['Unnamed: 0', 'pos', 'artist_name', 'track_uri', 'artist_uri',
       'track_name', 'album_uri', 'duration_ms', 'album_name',
       'num_playlist_appearances', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'uri', 'time_signature'],
      dtype='object')

In [17]:
data.drop(['Unnamed: 0','album_uri', 'duration_ms', 'album_name', 'num_playlist_appearances', 'time_signature'],axis=1, inplace=True)

In [18]:
data.drop('type', axis=1, inplace=True)

In [19]:
data.head()

Unnamed: 0.1,Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [20]:
data = data[~data.isna().any(axis=1)]

In [30]:
data.columns

Index(['Unnamed: 0', 'pos', 'artist_name', 'track_uri', 'artist_uri',
       'track_name', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'uri'],
      dtype='object')

In [21]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']

In [22]:
song_df = data[song_features]
X_train = np.array(song_df)

In [23]:
len(song_df)

2258242

In [24]:
X_train[:5]

array([[ 9.04000e-01,  8.13000e-01,  4.00000e+00, -7.10500e+00,
         1.21000e-01,  3.11000e-02,  6.97000e-03,  4.71000e-02,
         8.10000e-01,  1.25461e+02],
       [ 7.74000e-01,  8.38000e-01,  5.00000e+00, -3.91400e+00,
         1.14000e-01,  2.49000e-02,  2.50000e-02,  2.42000e-01,
         9.24000e-01,  1.43040e+02],
       [ 6.64000e-01,  7.58000e-01,  2.00000e+00, -6.58300e+00,
         2.10000e-01,  2.38000e-03,  0.00000e+00,  5.98000e-02,
         7.01000e-01,  9.92590e+01],
       [ 8.92000e-01,  7.14000e-01,  4.00000e+00, -6.05500e+00,
         1.41000e-01,  2.01000e-01,  2.34000e-04,  5.21000e-02,
         8.17000e-01,  1.00972e+02],
       [ 8.53000e-01,  6.06000e-01,  0.00000e+00, -4.59600e+00,
         7.13000e-02,  5.61000e-02,  0.00000e+00,  3.13000e-01,
         6.54000e-01,  9.47590e+01]])

In [35]:
%%timeit -r 1 -n 1
import time 

1.5 µs ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [36]:
%%timeit -r 1 -n 1
# Fitting the SVD MATRIX of text vectors without song_features
start = time
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
model.fit(preprocessing.normalize(X_train))
#print(model.labels_)
#print(len(model.labels_))

# for index, value in enumerate(model.labels_):
#     print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
#     if index == 5:
#         break
# print('Top terms per cluster')
# order_centroids = model.cluster_centers_.argsort()[:,::-1]
# terms = vectorizer.get_feature_names_out()
# #print(terms)
# for i in range(true_k):
#     print(f"Cluster {i}: ", end="")
#     for ind in order_centroids[i,:5]:
#         print(f"{terms[ind]} ", end="")
#     print()

3min 4s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [37]:
len(model.labels_)

2258242

# Using only 60% of the songs to create the model

In [77]:
data.reset_index(inplace=True)

ValueError: cannot insert level_0, already exists

In [84]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [85]:
data

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.1210,0.03110,0.006970,0.0471,0.8100,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.1140,0.02490,0.025000,0.2420,0.9240,143.040,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.2100,0.00238,0.000000,0.0598,0.7010,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.1410,0.20100,0.000234,0.0521,0.8170,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.05610,0.000000,0.3130,0.6540,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2258237,55,Fearless Motivation Instrumentals,spotify:track:0kIW60zpvXEysrxYhokd6I,spotify:artist:6cCaSjvFcRWSX1UahLtmdI,Failure (Epic Instrumental),0.412,0.291,6.0,-13.045,1.0,0.0359,0.73500,0.920000,0.1020,0.0378,119.822,spotify:track:0kIW60zpvXEysrxYhokd6I
2258238,56,Fearless Motivation Instrumentals,spotify:track:0JfAJafxPQh7fMPcoa0JEC,spotify:artist:6cCaSjvFcRWSX1UahLtmdI,Let Life Come to You (Instrumental),0.361,0.175,9.0,-21.917,0.0,0.0740,0.98900,0.859000,0.0750,0.3270,143.992,spotify:track:0JfAJafxPQh7fMPcoa0JEC
2258239,23,KLB,spotify:track:7j12SnqI04qCEao2nuwxX6,spotify:artist:6Jrploq7vACkXZg2ERVFK3,A Cada Dez Palavras,0.646,0.614,0.0,-7.908,1.0,0.0347,0.27300,0.000000,0.1020,0.8640,97.018,spotify:track:7j12SnqI04qCEao2nuwxX6
2258240,34,Raça Negra,spotify:track:5ZNUXxjQHzipyvWu7mN8Pn,spotify:artist:1RnHJ07H3jcpay9PrUPjnt,Dou A Vida Por Um Beijo - Ao Vivo Em Fortaleza...,0.468,0.897,7.0,-8.527,1.0,0.0637,0.39700,0.000007,0.9510,0.7940,100.577,spotify:track:5ZNUXxjQHzipyvWu7mN8Pn


In [86]:
percentage = 60
cut = int(len(data)*percentage/100)
print(cut)
training_data = data.iloc[0:cut]

1354945


In [87]:
training_data.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [89]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']
training_song_df = training_data[song_features]
X_train_song = np.array(training_song_df)
X_train_song

array([[9.04000e-01, 8.13000e-01, 4.00000e+00, ..., 4.71000e-02,
        8.10000e-01, 1.25461e+02],
       [7.74000e-01, 8.38000e-01, 5.00000e+00, ..., 2.42000e-01,
        9.24000e-01, 1.43040e+02],
       [6.64000e-01, 7.58000e-01, 2.00000e+00, ..., 5.98000e-02,
        7.01000e-01, 9.92590e+01],
       ...,
       [3.54000e-01, 6.49000e-01, 4.00000e+00, ..., 4.32000e-01,
        3.24000e-01, 1.22440e+02],
       [3.46000e-01, 2.88000e-01, 7.00000e+00, ..., 1.05000e-01,
        7.52000e-01, 9.16770e+01],
       [3.78000e-01, 8.32000e-01, 1.00000e+01, ..., 8.73000e-02,
        4.92000e-01, 8.39330e+01]])

In [143]:
training_data.head()

Unnamed: 0,pos,artist_name,track_uri,artist_uri,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4.0,-7.105,0.0,0.121,0.0311,0.00697,0.0471,0.81,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,0.774,0.838,5.0,-3.914,0.0,0.114,0.0249,0.025,0.242,0.924,143.04,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,0.664,0.758,2.0,-6.583,0.0,0.21,0.00238,0.0,0.0598,0.701,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,0.892,0.714,4.0,-6.055,0.0,0.141,0.201,0.000234,0.0521,0.817,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,0.853,0.606,0.0,-4.596,1.0,0.0713,0.0561,0.0,0.313,0.654,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [104]:
training_data.describe()

Unnamed: 0,pos,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0,1354945.0
mean,59.86092,0.5540723,0.5949331,5.263755,-9.329242,0.6540088,0.08901318,0.3382571,0.2024683,0.2086518,0.4748901,120.2727
std,51.03216,0.1821776,0.2614433,3.561811,5.427323,0.4756905,0.1120471,0.3470865,0.3383227,0.1888746,0.2669975,29.81131
min,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.433,0.407,2.0,-11.457,0.0,0.0353,0.0207,0.0,0.0958,0.251,97.04
50%,45.0,0.568,0.633,5.0,-7.909,1.0,0.0471,0.193,0.000348,0.128,0.464,120.018
75%,88.0,0.69,0.814,8.0,-5.666,1.0,0.0852,0.652,0.29,0.263,0.692,138.57
max,249.0,0.995,1.0,11.0,4.923,1.0,0.969,0.996,1.0,1.0,1.0,249.915


In [108]:
check = X_train_song[:,0].reshape(-1,1)
print(check)
preprocessing.normalize(X_train_song[0:10], axis=0)

[[0.904]
 [0.774]
 [0.664]
 ...
 [0.354]
 [0.346]
 [0.378]]


array([[ 0.3703965 ,  0.33830383,  0.34426519, -0.4148599 ,  0.25511781,
         0.06506596,  0.26849057,  0.08533737,  0.34412463,  0.32017151],
       [ 0.31713152,  0.34870678,  0.43033148, -0.22853788,  0.24035893,
         0.05209461,  0.96302212,  0.43846379,  0.39255698,  0.36503242],
       [ 0.27206115,  0.31541735,  0.17213259, -0.3843804 ,  0.44276645,
         0.00497932,  0.        ,  0.10834766,  0.2978165 ,  0.25330504],
       [ 0.36547974,  0.29710816,  0.34426519, -0.35355056,  0.29728604,
         0.42052277,  0.00901389,  0.09439654,  0.34709854,  0.25767655],
       [ 0.34950024,  0.25216743,  0.        , -0.26835976,  0.15032975,
         0.11736979,  0.        ,  0.567104  ,  0.27784877,  0.24182122],
       [ 0.3609727 ,  0.32790089,  0.17213259, -0.27262222,  0.35421316,
         0.04435365,  0.        ,  0.06830614,  0.25150837,  0.26794819],
       [ 0.27124169,  0.21097176,  0.43033148, -0.4810156 ,  0.24879258,
         0.53768334,  0.        ,  0.08425027

In [158]:
training_data[song_features].drop('danceability', axis=1)

Unnamed: 0,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.813,4.0,-7.105,0.1210,0.03110,0.006970,0.0471,0.810,125.461
1,0.838,5.0,-3.914,0.1140,0.02490,0.025000,0.2420,0.924,143.040
2,0.758,2.0,-6.583,0.2100,0.00238,0.000000,0.0598,0.701,99.259
3,0.714,4.0,-6.055,0.1410,0.20100,0.000234,0.0521,0.817,100.972
4,0.606,0.0,-4.596,0.0713,0.05610,0.000000,0.3130,0.654,94.759
...,...,...,...,...,...,...,...,...,...
1354940,0.789,2.0,-7.392,0.0514,0.45500,0.000000,0.1090,0.868,139.318
1354941,0.590,11.0,-7.511,0.1610,0.04440,0.002570,0.0576,0.314,79.934
1354942,0.649,4.0,-6.150,0.0392,0.00178,0.000000,0.4320,0.324,122.440
1354943,0.288,7.0,-15.474,0.0306,0.78700,0.401000,0.1050,0.752,91.677


In [159]:
# Fitting the SVD MATRIX of text vectors without song_features
true_k = 10

for i in song_features:
    print(i)
    print(training_data[i][0:5])
    X_train_song = np.array(training_data[song_features].drop(i,axis=1))
    print(X_train_song[0:5])
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
    model.fit(X_train_song)
    # save the model as a pickle file
    model_pkl_file = f"model_no_{i}.pkl"  

    with open(model_pkl_file, 'wb') as file:  
        pickle.dump(model, file)
    print(f"model with no {i} feature")


danceability
0    0.904
1    0.774
2    0.664
3    0.892
4    0.853
Name: danceability, dtype: float64
[[ 8.13000e-01  4.00000e+00 -7.10500e+00  1.21000e-01  3.11000e-02
   6.97000e-03  4.71000e-02  8.10000e-01  1.25461e+02]
 [ 8.38000e-01  5.00000e+00 -3.91400e+00  1.14000e-01  2.49000e-02
   2.50000e-02  2.42000e-01  9.24000e-01  1.43040e+02]
 [ 7.58000e-01  2.00000e+00 -6.58300e+00  2.10000e-01  2.38000e-03
   0.00000e+00  5.98000e-02  7.01000e-01  9.92590e+01]
 [ 7.14000e-01  4.00000e+00 -6.05500e+00  1.41000e-01  2.01000e-01
   2.34000e-04  5.21000e-02  8.17000e-01  1.00972e+02]
 [ 6.06000e-01  0.00000e+00 -4.59600e+00  7.13000e-02  5.61000e-02
   0.00000e+00  3.13000e-01  6.54000e-01  9.47590e+01]]
model with no danceability feature
energy
0    0.813
1    0.838
2    0.758
3    0.714
4    0.606
Name: energy, dtype: float64
[[ 9.04000e-01  4.00000e+00 -7.10500e+00  1.21000e-01  3.11000e-02
   6.97000e-03  4.71000e-02  8.10000e-01  1.25461e+02]
 [ 7.74000e-01  5.00000e+00 -3.91400e+

In [161]:
for i in song_features:
    print(i)
    with open(f"model_no_{i}.pkl", 'rb') as file:  
        loaded_model = pickle.load(file)
        print(loaded_model.labels_)
        # evaluate model 
        print(training_data[['artist_name', 'track_name']].iloc[0:5])
        y_predict = loaded_model.predict(np.array(training_data[song_features].drop(i,axis=1).iloc[0:5]))
        print(y_predict)
        file.close()


danceability
[3 0 7 ... 3 7 1]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat Man Scoop)
1     Britney Spears                                       Toxic
2            Beyoncé                               Crazy In Love
3  Justin Timberlake                              Rock Your Body
4             Shaggy                                It Wasn't Me
[3 0 7 7 7]
energy
[2 8 0 ... 2 0 3]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat Man Scoop)
1     Britney Spears                                       Toxic
2            Beyoncé                               Crazy In Love
3  Justin Timberlake                              Rock Your Body
4             Shaggy                                It Wasn't Me
[2 8 0 0 0]
key
[9 3 6 ... 9 6 4]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat

In [136]:
# Fitting the SVD MATRIX of text vectors without song_features
true_k = 10

X_train_song = np.array(training_data[song_features])
print(X_train_song[0:5])
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
model.fit(X_train_song)
# save the model as a pickle file
model_pkl_file = f"model_all.pkl"  
with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
    print(f"model with all features completed")

[[ 9.04000e-01  8.13000e-01  4.00000e+00 -7.10500e+00  1.21000e-01
   3.11000e-02  6.97000e-03  4.71000e-02  8.10000e-01  1.25461e+02]
 [ 7.74000e-01  8.38000e-01  5.00000e+00 -3.91400e+00  1.14000e-01
   2.49000e-02  2.50000e-02  2.42000e-01  9.24000e-01  1.43040e+02]
 [ 6.64000e-01  7.58000e-01  2.00000e+00 -6.58300e+00  2.10000e-01
   2.38000e-03  0.00000e+00  5.98000e-02  7.01000e-01  9.92590e+01]
 [ 8.92000e-01  7.14000e-01  4.00000e+00 -6.05500e+00  1.41000e-01
   2.01000e-01  2.34000e-04  5.21000e-02  8.17000e-01  1.00972e+02]
 [ 8.53000e-01  6.06000e-01  0.00000e+00 -4.59600e+00  7.13000e-02
   5.61000e-02  0.00000e+00  3.13000e-01  6.54000e-01  9.47590e+01]]
model with all features completed


In [146]:
with open(f"model_all.pkl", 'rb') as file:  
    loaded_model = pickle.load(file)
    print(loaded_model.labels_[0:5])
    print(training_data[['artist_name', 'track_name']].iloc[0:5])
    # evaluate model 
    y_predict = loaded_model.predict(np.array(training_data[song_features].iloc[0:5]))
    print(y_predict)
    file.close()

[7 0 5 5 5]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat Man Scoop)
1     Britney Spears                                       Toxic
2            Beyoncé                               Crazy In Love
3  Justin Timberlake                              Rock Your Body
4             Shaggy                                It Wasn't Me
[7 0 5 5 5]


In [150]:
for i in range(len(song_features)):
    training_data[song_features]

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.904,0.813,4.0,-7.105,0.1210,0.03110,0.006970,0.0471,0.810,125.461
1,0.774,0.838,5.0,-3.914,0.1140,0.02490,0.025000,0.2420,0.924,143.040
2,0.664,0.758,2.0,-6.583,0.2100,0.00238,0.000000,0.0598,0.701,99.259
3,0.892,0.714,4.0,-6.055,0.1410,0.20100,0.000234,0.0521,0.817,100.972
4,0.853,0.606,0.0,-4.596,0.0713,0.05610,0.000000,0.3130,0.654,94.759
...,...,...,...,...,...,...,...,...,...,...
1354940,0.571,0.789,2.0,-7.392,0.0514,0.45500,0.000000,0.1090,0.868,139.318
1354941,0.536,0.590,11.0,-7.511,0.1610,0.04440,0.002570,0.0576,0.314,79.934
1354942,0.354,0.649,4.0,-6.150,0.0392,0.00178,0.000000,0.4320,0.324,122.440
1354943,0.346,0.288,7.0,-15.474,0.0306,0.78700,0.401000,0.1050,0.752,91.677


In [111]:

# Fitting the SVD MATRIX of text vectors without song_features
true_k = 10

for i in song_features:
    print(i)
    print(training_data[i][0:5])
    X_train_song = np.array(training_data[i]).reshape(-1,1)
    print(X_train_song[0:5])
    model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=100, random_state=1)
    model.fit(X_train_song)
    # save the model as a pickle file
    model_pkl_file = f"model_{i}.pkl"  

    with open(model_pkl_file, 'wb') as file:  
        pickle.dump(model, file)
    print(f"model with only {i} feature completed")

#     print(model.labels_)
#     print(len(model.labels_))

# for index, value in enumerate(model.labels_):
#     print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
#     if index == 5:
#         break
# print('Top terms per cluster')
# order_centroids = model.cluster_centers_.argsort()[:,::-1]
# terms = vectorizer.get_feature_names_out()
# #print(terms)
# for i in range(true_k):
#     print(f"Cluster {i}: ", end="")
#     for ind in order_centroids[i,:5]:
#         print(f"{terms[ind]} ", end="")
#     print()

danceability
0    0.904
1    0.774
2    0.664
3    0.892
4    0.853
Name: danceability, dtype: float64
[[0.904]
 [0.774]
 [0.664]
 [0.892]
 [0.853]]
model with only danceability feature completed
energy
0    0.813
1    0.838
2    0.758
3    0.714
4    0.606
Name: energy, dtype: float64
[[0.813]
 [0.838]
 [0.758]
 [0.714]
 [0.606]]
model with only energy feature completed
key
0    4.0
1    5.0
2    2.0
3    4.0
4    0.0
Name: key, dtype: float64
[[4.]
 [5.]
 [2.]
 [4.]
 [0.]]
model with only key feature completed
loudness
0   -7.105
1   -3.914
2   -6.583
3   -6.055
4   -4.596
Name: loudness, dtype: float64
[[-7.105]
 [-3.914]
 [-6.583]
 [-6.055]
 [-4.596]]
model with only loudness feature completed
speechiness
0    0.1210
1    0.1140
2    0.2100
3    0.1410
4    0.0713
Name: speechiness, dtype: float64
[[0.121 ]
 [0.114 ]
 [0.21  ]
 [0.141 ]
 [0.0713]]
model with only speechiness feature completed
acousticness
0    0.03110
1    0.02490
2    0.00238
3    0.20100
4    0.05610
Name: acoust

In [121]:
training_data[song_features[0]][0]

0.904

In [131]:
np.array(training_data['danceability'].iloc[0:2]).reshape(-1,1)

array([[0.904],
       [0.774]])

In [147]:
# load model from pickle file
for i in song_features:
    print(i)
    with open(f"model_{i}.pkl", 'rb') as file:  
        loaded_model = pickle.load(file)
        print(loaded_model.labels_)
        # evaluate model 
        print(training_data[['artist_name', 'track_name']].iloc[0:5])
        y_predict = loaded_model.predict(np.array(training_data[i].iloc[0:5]).reshape(-1,1))
        print(y_predict)
        file.close()


danceability
[7 0 9 ... 4 4 4]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat Man Scoop)
1     Britney Spears                                       Toxic
2            Beyoncé                               Crazy In Love
3  Justin Timberlake                              Rock Your Body
4             Shaggy                                It Wasn't Me
[7 0 9 7 7]
energy
[8 0 8 ... 2 1 0]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat Man Scoop)
1     Britney Spears                                       Toxic
2            Beyoncé                               Crazy In Love
3  Justin Timberlake                              Rock Your Body
4             Shaggy                                It Wasn't Me
[8 0 8 4 2]
key
[0 3 9 ... 0 5 6]
         artist_name                                  track_name
0      Missy Elliott  Lose Control (feat. Ciara & Fat

In [75]:
# Select indexes of playlists that have a description
# playlists_id_with_description = music_df[~music_df['playlist_description'].isna()]['playlist_pid'].unique()
# print(playlists_id_with_description)

In [76]:
# Working only with playlists that contain a "description"
# subset_df = music_df[music_df['playlist_pid'].isin(playlists_id_with_description)]
# print(subset_df['playlist_pid'].unique())

In [77]:
num_records = len(subset_df)
num_unique_records = len(subset_df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).unique())
print('Number of table records: ', num_records)
print('Number of unique artist, song pairs: ', num_unique_records )
print('Number of repeated songs: ', num_records - num_unique_records )
print('Number of playlists: ', len(subset_df['playlist_pid'].unique()))
print('Number of unique artists: ', len(subset_df['track_artist_name'].unique()))

Number of table records:  67503
Number of unique artist, song pairs:  34250
Number of repeated songs:  33253
Number of playlists:  1000
Number of unique artists:  9722


In [78]:
# Function to select track_uri ONLY Nrecords records for now
Nrecords = 250
def get_features(df, Nrecords):
    indexes = []
    audio_features = []
    track_popularity = []
    artist_genre = []
    artist_popularity = []
    album_popularity = []
    start = 0
    for row_index, row in df.iloc[0:Nrecords].iterrows():
        #print(row_index, row['track_track_uri'])
        track_uri = row['track_track_uri']
        artist_uri = row['track_artist_uri']
        album_uri = row['track_album_uri']
        if start == 0:
            keys = spotify.audio_features(tracks=track_uri)[0].keys()
        start += 1
    
        track_popularity.append(spotify.track(track_uri)['popularity'])
        
        indexes.append(row_index) 
        audio_features.append(spotify.audio_features(tracks=track_uri)[0].values())
        
        artist_results = spotify.artist(artist_uri)
        
        artist_genre.append(artist_results['genres'])
        artist_popularity.append(artist_results['popularity'])
        #print(artist_results['genres'], artist_results['popularity'])
        album_results = spotify.album(album_uri)
        album_popularity.append(album_results['popularity'])
        
        
    features_df = pd.DataFrame(audio_features, columns=keys)
    features_df['song_popularity'] = track_popularity
    features_df['artist_genre'] = artist_genre
    features_df['artist_popularity']  = artist_popularity
    features_df['album_popularity'] = album_popularity
    features_df['index'] = indexes
    features_df.set_index('index', inplace=True)
    return features_df

spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [79]:
audio_sp_df = get_features(subset_df , Nrecords)
audio_sp_df.index.name = None
audio_sp_df.drop(['type', 'id', 'track_href', 'analysis_url'], inplace=True, axis=1)
audio_sp_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,duration_ms,time_signature,song_popularity,artist_genre,artist_popularity,album_popularity
0,0.904,0.813,4,-7.105,0,0.1210,0.031100,0.006970,0.0471,0.8100,125.461,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,226864,4,69,"[dance pop, hip hop, hip pop, neo soul, pop ra...",72,62
1,0.774,0.838,5,-3.914,0,0.1140,0.024900,0.025000,0.2420,0.9240,143.040,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,198800,4,84,"[dance pop, pop]",80,78
2,0.664,0.758,2,-6.583,0,0.2100,0.002380,0.000000,0.0598,0.7010,99.259,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,235933,4,21,"[pop, r&b]",87,17
3,0.892,0.714,4,-6.055,0,0.1410,0.201000,0.000234,0.0521,0.8170,100.972,spotify:track:1AWQoqb9bSvzTjaLralEkT,267267,4,79,"[dance pop, pop]",79,77
4,0.853,0.606,0,-4.596,1,0.0713,0.056100,0.000000,0.3130,0.6540,94.759,spotify:track:1lzr43nnXAijIGYnCT8M8H,227600,4,4,"[dance pop, pop rap, reggae fusion]",73,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,0.239,0.820,0,-9.410,1,0.0487,0.008470,0.676000,0.3690,0.5470,165.180,spotify:track:1FS4e2J4JeNZKDtcdQpUmn,273800,4,14,"[alternative rock, experimental rock, hardcore...",43,36
246,0.218,0.819,0,-9.783,1,0.0604,0.000025,0.604000,0.3620,0.4140,147.555,spotify:track:77VzCVHQgX613AkssBWK8M,201573,4,15,"[alternative rock, experimental rock, hardcore...",43,36
247,0.386,0.964,7,-11.310,1,0.0760,0.058900,0.948000,0.3620,0.0571,90.257,spotify:track:7lr5a1V6SZuv1RWFSUOOHe,209200,4,24,"[alternative rock, experimental rock, hardcore...",43,46
248,0.503,0.803,0,-9.701,1,0.0485,0.010000,0.064800,0.2920,0.9410,179.007,spotify:track:30cHDhxUqgnHq78hv5UjMx,225187,4,53,"[mod revival, new wave]",39,46


In [80]:
# Parameters to use for the model
audio_sp_df.keys()

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'uri', 'duration_ms', 'time_signature', 'song_popularity',
       'artist_genre', 'artist_popularity', 'album_popularity'],
      dtype='object')

In [81]:
check_df = subset_df.iloc[0:Nrecords].merge(audio_sp_df, how='left' , left_on = 'track_track_uri', right_on='uri')
check_df.drop(['track_pos', 'uri', 'mode', 'playlist_duration_ms','playlist_num_albums','playlist_num_artists',  'track_artist_uri', 'track_album_uri', 'track_duration_ms','playlist_num_followers', 'playlist_num_edits', 'playlist_collaborative', 'playlist_modified_at', 'playlist_num_tracks'], inplace = True, axis=1)
check_df

Unnamed: 0,track_artist_name,track_track_uri,track_track_name,track_album_name,playlist_name,playlist_pid,playlist_description,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_popularity,artist_genre,artist_popularity,album_popularity
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),The Cookbook,Throwbacks,0,,0.904,0.813,4,...,0.006970,0.0471,0.8100,125.461,226864,4,69,"[dance pop, hip hop, hip pop, neo soul, pop ra...",72,62
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,In The Zone,Throwbacks,0,,0.774,0.838,5,...,0.025000,0.2420,0.9240,143.040,198800,4,84,"[dance pop, pop]",80,78
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0,,0.664,0.758,2,...,0.000000,0.0598,0.7010,99.259,235933,4,21,"[pop, r&b]",87,17
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justified,Throwbacks,0,,0.892,0.714,4,...,0.000234,0.0521,0.8170,100.972,267267,4,79,"[dance pop, pop]",79,77
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Hot Shot,Throwbacks,0,,0.853,0.606,0,...,0.000000,0.3130,0.6540,94.759,227600,4,4,"[dance pop, pop rap, reggae fusion]",73,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,Hüsker Dü,spotify:track:1FS4e2J4JeNZKDtcdQpUmn,Turn It Around,Warehouse: Songs And Stories,mat,3,,0.239,0.820,0,...,0.676000,0.3690,0.5470,165.180,273800,4,14,"[alternative rock, experimental rock, hardcore...",43,36
248,Hüsker Dü,spotify:track:77VzCVHQgX613AkssBWK8M,She's A Woman [And Now He Is A Man],Warehouse: Songs And Stories,mat,3,,0.218,0.819,0,...,0.604000,0.3620,0.4140,147.555,201573,4,15,"[alternative rock, experimental rock, hardcore...",43,36
249,Hüsker Dü,spotify:track:7lr5a1V6SZuv1RWFSUOOHe,Crystal,Candy Apple Grey,mat,3,,0.386,0.964,7,...,0.948000,0.3620,0.0571,90.257,209200,4,24,"[alternative rock, experimental rock, hardcore...",43,46
250,The Vapors,spotify:track:30cHDhxUqgnHq78hv5UjMx,Turning Japanese,Turning Japanese - Best Of The Vapors,mat,3,,0.503,0.803,0,...,0.064800,0.2920,0.9410,179.007,225187,4,53,"[mod revival, new wave]",39,46


In [82]:
check_df.describe()

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_popularity,artist_popularity,album_popularity
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,0.580277,0.685518,4.90873,-7.179242,0.085897,0.213358,0.112808,0.185913,0.525262,123.365476,234801.984127,3.936508,28.68254,58.253968,29.186508
std,0.170412,0.21356,3.490968,3.991291,0.086673,0.267369,0.263273,0.139082,0.234469,29.634012,74377.828523,0.339816,30.133343,18.832216,30.00546
min,0.0849,0.0134,0.0,-29.917,0.025,1e-05,0.0,0.0268,0.0385,48.973,65306.0,1.0,0.0,0.0,0.0
25%,0.47025,0.57275,2.0,-8.843,0.0378,0.009015,0.0,0.0921,0.353,99.855,200030.75,4.0,0.0,46.0,0.0
50%,0.6075,0.7265,4.0,-6.1525,0.0527,0.08755,4.3e-05,0.132,0.544,122.3935,223541.0,4.0,19.5,60.0,24.5
75%,0.71625,0.848,8.0,-4.4155,0.0864,0.31875,0.01635,0.25,0.71625,143.81675,256732.0,4.0,55.0,71.0,59.0
max,0.938,0.976,11.0,-1.029,0.505,0.989,0.991,0.869,0.97,210.857,658987.0,5.0,85.0,90.0,86.0


In [83]:
# Make album popularity equal to artist popularity if album popularity is 0
check_df['album_popularity'] = np.where(check_df['album_popularity'] == 0, check_df['artist_popularity'], check_df['album_popularity'])
check_df['song_popularity'] = np.where(check_df['song_popularity'] == 0, check_df['artist_popularity'], check_df['song_popularity'])
check_df

Unnamed: 0,track_artist_name,track_track_uri,track_track_name,track_album_name,playlist_name,playlist_pid,playlist_description,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_popularity,artist_genre,artist_popularity,album_popularity
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),The Cookbook,Throwbacks,0,,0.904,0.813,4,...,0.006970,0.0471,0.8100,125.461,226864,4,69,"[dance pop, hip hop, hip pop, neo soul, pop ra...",72,62
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,In The Zone,Throwbacks,0,,0.774,0.838,5,...,0.025000,0.2420,0.9240,143.040,198800,4,84,"[dance pop, pop]",80,78
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0,,0.664,0.758,2,...,0.000000,0.0598,0.7010,99.259,235933,4,21,"[pop, r&b]",87,17
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justified,Throwbacks,0,,0.892,0.714,4,...,0.000234,0.0521,0.8170,100.972,267267,4,79,"[dance pop, pop]",79,77
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Hot Shot,Throwbacks,0,,0.853,0.606,0,...,0.000000,0.3130,0.6540,94.759,227600,4,4,"[dance pop, pop rap, reggae fusion]",73,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,Hüsker Dü,spotify:track:1FS4e2J4JeNZKDtcdQpUmn,Turn It Around,Warehouse: Songs And Stories,mat,3,,0.239,0.820,0,...,0.676000,0.3690,0.5470,165.180,273800,4,14,"[alternative rock, experimental rock, hardcore...",43,36
248,Hüsker Dü,spotify:track:77VzCVHQgX613AkssBWK8M,She's A Woman [And Now He Is A Man],Warehouse: Songs And Stories,mat,3,,0.218,0.819,0,...,0.604000,0.3620,0.4140,147.555,201573,4,15,"[alternative rock, experimental rock, hardcore...",43,36
249,Hüsker Dü,spotify:track:7lr5a1V6SZuv1RWFSUOOHe,Crystal,Candy Apple Grey,mat,3,,0.386,0.964,7,...,0.948000,0.3620,0.0571,90.257,209200,4,24,"[alternative rock, experimental rock, hardcore...",43,46
250,The Vapors,spotify:track:30cHDhxUqgnHq78hv5UjMx,Turning Japanese,Turning Japanese - Best Of The Vapors,mat,3,,0.503,0.803,0,...,0.064800,0.2920,0.9410,179.007,225187,4,53,"[mod revival, new wave]",39,46


In [84]:
check_df.describe()

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,song_popularity,artist_popularity,album_popularity
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,0.580277,0.685518,4.90873,-7.179242,0.085897,0.213358,0.112808,0.185913,0.525262,123.365476,234801.984127,3.936508,53.02381,58.253968,51.496032
std,0.170412,0.21356,3.490968,3.991291,0.086673,0.267369,0.263273,0.139082,0.234469,29.634012,74377.828523,0.339816,21.643973,18.832216,22.303502
min,0.0849,0.0134,0.0,-29.917,0.025,1e-05,0.0,0.0268,0.0385,48.973,65306.0,1.0,1.0,0.0,0.0
25%,0.47025,0.57275,2.0,-8.843,0.0378,0.009015,0.0,0.0921,0.353,99.855,200030.75,4.0,40.0,46.0,40.0
50%,0.6075,0.7265,4.0,-6.1525,0.0527,0.08755,4.3e-05,0.132,0.544,122.3935,223541.0,4.0,55.0,60.0,55.0
75%,0.71625,0.848,8.0,-4.4155,0.0864,0.31875,0.01635,0.25,0.71625,143.81675,256732.0,4.0,68.0,71.0,69.0
max,0.938,0.976,11.0,-1.029,0.505,0.989,0.991,0.869,0.97,210.857,658987.0,5.0,90.0,90.0,90.0


In [85]:
check_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 252 entries, 0 to 251
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   track_artist_name     252 non-null    object 
 1   track_track_uri       252 non-null    object 
 2   track_track_name      252 non-null    object 
 3   track_album_name      252 non-null    object 
 4   playlist_name         252 non-null    object 
 5   playlist_pid          252 non-null    object 
 6   playlist_description  0 non-null      object 
 7   danceability          252 non-null    float64
 8   energy                252 non-null    float64
 9   key                   252 non-null    int64  
 10  loudness              252 non-null    float64
 11  speechiness           252 non-null    float64
 12  acousticness          252 non-null    float64
 13  instrumentalness      252 non-null    float64
 14  liveness              252 non-null    float64
 15  valence               2

In [86]:
# NLP workflow
# Need to vectorize string fields
# Use all string fields and combine them in a column containing all the words f
# from collections import Counter

In [87]:
check_df['artist_genre']

0      [dance pop, hip hop, hip pop, neo soul, pop ra...
1                                       [dance pop, pop]
2                                             [pop, r&b]
3                                       [dance pop, pop]
4                    [dance pop, pop rap, reggae fusion]
                             ...                        
247    [alternative rock, experimental rock, hardcore...
248    [alternative rock, experimental rock, hardcore...
249    [alternative rock, experimental rock, hardcore...
250                              [mod revival, new wave]
251                                                   []
Name: artist_genre, Length: 252, dtype: object

In [88]:
#check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join([i.replace('-', '_') for i in x]))
check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join([i.replace('-', '_').replace('_', '') for i in x]))

In [36]:
check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ' '.join(list(set(x.split()))))

In [38]:
# recheck = check.apply(lambda x: list(set(x.split())))
# print(check)
# print(recheck)
# # check_df['artist_genre'] = check_df['artist_genre'].apply(lambda x: ''.join([i.replace('', '_') for i in x]))

In [90]:
label_columns = ['track_track_name', 'artist_genre']
check_df[label_columns]

Unnamed: 0,track_track_name,artist_genre
0,Lose Control (feat. Ciara & Fat Man Scoop),dance pop hip hop hip pop neo soul pop rap r&b...
1,Toxic,dance pop pop
2,Crazy In Love,pop r&b
3,Rock Your Body,dance pop pop
4,It Wasn't Me,dance pop pop rap reggae fusion
...,...,...
247,Turn It Around,alternative rock experimental rock hardcore pu...
248,She's A Woman [And Now He Is A Man],alternative rock experimental rock hardcore pu...
249,Crystal,alternative rock experimental rock hardcore pu...
250,Turning Japanese,mod revival new wave


In [91]:
# Combine 5 fields into one
# string_field = check_df.track_track_name.str.cat(" " + check_df.artist_genre)
string_field = check_df.artist_genre

string_field = string_field.replace({"r\&b": "rhythm blues"}, regex = True)
string_field = string_field.replace({"[^A-Za-z ]+": ""}, regex = True)
#print('last song: ',string_field.tail())
string_field

0      dance pop hip hop hip pop neo soul pop rap rhy...
1                                          dance pop pop
2                                       pop rhythm blues
3                                          dance pop pop
4                        dance pop pop rap reggae fusion
                             ...                        
247    alternative rock experimental rock hardcore pu...
248    alternative rock experimental rock hardcore pu...
249    alternative rock experimental rock hardcore pu...
250                                 mod revival new wave
251                                                     
Name: artist_genre, Length: 252, dtype: object

In [92]:
test_song = string_field[0:int(len(string_field)*0.4)]
print(test_song[0])
#fraction = 0.0
#string_field = string_field[int(len(string_field)*fraction):]


dance pop hip hop hip pop neo soul pop rap rhythm blues rap urban contemporary virginia hip hop


In [93]:
# for i in string_field:
#     print(set(i.split()))

In [94]:
#check_df['All_text'] = check_df.track_track_name.str.cat(" " + check_df.artist_genre)
check_df['All_text'] = check_df.artist_genre
# check_df

In [96]:
# import nltk
# from nltk.stem import WordNetLemmatizer

In [97]:
# stopwords = nltk.corpus.stopwords.words('english')
# lemmatizer = WordNetLemmatizer()
# nltk.download('stopwords')

In [98]:
check_df.keys()

Index(['track_artist_name', 'track_track_uri', 'track_track_name',
       'track_album_name', 'playlist_name', 'playlist_pid',
       'playlist_description', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature', 'song_popularity',
       'artist_genre', 'artist_popularity', 'album_popularity', 'All_text'],
      dtype='object')

In [99]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'song_popularity', 'artist_popularity', 'album_popularity']
df_song_features = check_df[song_features]

In [100]:
song_features_array = np.array(df_song_features)

In [101]:
song_features = ['danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo']
df_song_features = check_df[song_features]

In [102]:
df_song_features

Unnamed: 0,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,0.904,0.813,4,-7.105,0.1210,0.031100,0.006970,0.0471,0.8100,125.461
1,0.774,0.838,5,-3.914,0.1140,0.024900,0.025000,0.2420,0.9240,143.040
2,0.664,0.758,2,-6.583,0.2100,0.002380,0.000000,0.0598,0.7010,99.259
3,0.892,0.714,4,-6.055,0.1410,0.201000,0.000234,0.0521,0.8170,100.972
4,0.853,0.606,0,-4.596,0.0713,0.056100,0.000000,0.3130,0.6540,94.759
...,...,...,...,...,...,...,...,...,...,...
247,0.239,0.820,0,-9.410,0.0487,0.008470,0.676000,0.3690,0.5470,165.180
248,0.218,0.819,0,-9.783,0.0604,0.000025,0.604000,0.3620,0.4140,147.555
249,0.386,0.964,7,-11.310,0.0760,0.058900,0.948000,0.3620,0.0571,90.257
250,0.503,0.803,0,-9.701,0.0485,0.010000,0.064800,0.2920,0.9410,179.007


In [103]:
song_features_array = np.array(df_song_features)

In [134]:
vectorizer = TfidfVectorizer(stop_words='english')

X_train = vectorizer.fit_transform(string_field)
#print(X_train[0])
x_train_array = X_train.toarray()

svd = TruncatedSVD(n_components=100)
#print(type(svd))
text_X_train = svd.fit_transform(X_train)
#print(text_X_train)
#print(text_X_train.shape)

#for index, value in enumerate(text_X_train):
#    print(string_field[index], value)

#print(text_X_train)
#print(x_train_array)
print(x_train_array.shape)
print(x_train_array[0][44])
f_xtrain_nosvp = np.concatenate((song_features_array, x_train_array), axis=1)
print('xtrain_nosvp', f_xtrain_nosvp.shape)
f_xtrain = np.concatenate((song_features_array, text_X_train), axis=1)
#print(f_xtrain)
print(f_xtrain.shape)

#print(X_train)
#print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")

(252, 180)
0.0
xtrain_nosvp (252, 190)
(252, 110)


In [135]:
print(f"{X_train.nnz / np.prod(X_train.shape):.3f}")
print(f"About {X_train.nnz / np.prod(X_train.shape)*100:.3f}\% of the entries in the matrix are non zero")

0.025
About 2.478\% of the entries in the matrix are non zero


In [156]:
# Fitting the Full MATRIX of text vectors without song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(X_train))
X_train_dist = model.transform(X_train)**2
print(X_train_dist.shape)
print(X_train_dist)
dist_df = pd.DataFrame(X_train_dist.sum(axis=1).round(2), columns=['sqdist'])
print(X_train_dist[0].sum())
print(X_train_dist[0].min())
dist_df['label'] = model.labels_
print(dist_df.head())
print(dist_df.shape)
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:10]:
        print(f"{terms[ind]} ", end="")
    print()

(252, 10)
[[0.89227345 1.27102275 1.91278278 ... 1.80826174 0.80716952 1.47876661]
 [0.96919466 1.42402586 1.91278278 ... 1.67767215 0.28438844 1.31773477]
 [1.01477019 0.52146957 1.91278278 ... 1.77598849 0.85240445 1.50461953]
 ...
 [1.00209608 1.55704328 1.91278278 ... 1.81868342 1.21029214 1.42723335]
 [1.054297   1.60022926 1.91278278 ... 1.87177704 1.44295858 0.78570301]
 [0.05917394 0.60712705 0.91278278 ... 0.87177704 0.44295858 0.53121522]]
14.47126880608181
0.807169521599963
   sqdist  label
0   14.47      8
1   13.38      8
2   13.78      1
3   13.38      8
4   14.60      8
(252, 2)
STRING: dance pop hip hop hip pop neo soul pop rap rhythm blues rap urban contemporary virginia hip hop CLUSTER_INDEX: 8
STRING: dance pop pop CLUSTER_INDEX: 8
STRING: pop rhythm blues CLUSTER_INDEX: 1
STRING: dance pop pop CLUSTER_INDEX: 8
STRING: dance pop pop rap reggae fusion CLUSTER_INDEX: 8
STRING: atl hip hop contemporary rhythm blues dance pop pop rhythm blues rap south carolina hip hop u

In [147]:
# Fitting the Full MATRIX of text vectors + song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(f_xtrain_nosvp))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:10]:
        print(f"{terms[ind]} ", end="")
    print()

#print(list(test_song[:1]))
#print(list(test_song)[0])

# X_test = vectorizer.transform(list(test_song))
# #print(X_test)
# print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")
# # feature_names = vectorizer.get_feature_names_out()
# # print(feature_names)
# prediction = model.predict(X_test)
# print(prediction)

STRING: dance pop hip hop hip pop neo soul pop rap rhythm blues rap urban contemporary virginia hip hop CLUSTER_INDEX: 3
STRING: dance pop pop CLUSTER_INDEX: 3
STRING: pop rhythm blues CLUSTER_INDEX: 8
STRING: dance pop pop CLUSTER_INDEX: 3
STRING: dance pop pop rap reggae fusion CLUSTER_INDEX: 8
STRING: atl hip hop contemporary rhythm blues dance pop pop rhythm blues rap south carolina hip hop urban contemporary CLUSTER_INDEX: 3
Top terms per cluster
Cluster 0: atl alabama ambient america fusion slowcore mod group dreamo singersongwriter 
Cluster 1: atl alabama afrofuturism art aesthetic metal punk area ambient alternative 
Cluster 2: atl alabama ambient aesthetic art afrofuturism america metal kpop area 
Cluster 3: atl alabama afrofuturism art aesthetic punk area metal ambient america 
Cluster 4: atl alabama aesthetic ambient afrofuturism art america shoegaze area candy 
Cluster 5: atl alabama afrofuturism aesthetic art ambient punk metal area mellow 
Cluster 6: atl alabama afrofutur

In [144]:
# Fitting the SVD MATRIX of text vectors without song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100, random_state=1)
model.fit(preprocessing.normalize(text_X_train))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:5]:
        print(f"{terms[ind]} ", end="")
    print()

STRING: dance pop hip hop hip pop neo soul pop rap rhythm blues rap urban contemporary virginia hip hop CLUSTER_INDEX: 6
STRING: dance pop pop CLUSTER_INDEX: 6
STRING: pop rhythm blues CLUSTER_INDEX: 3
STRING: dance pop pop CLUSTER_INDEX: 6
STRING: dance pop pop rap reggae fusion CLUSTER_INDEX: 6
STRING: atl hip hop contemporary rhythm blues dance pop pop rhythm blues rap south carolina hip hop urban contemporary CLUSTER_INDEX: 6
Top terms per cluster
Cluster 0: alabama afrofuturism album aesthetic band 
Cluster 1: aesthetic atlanta alabama area carolina 
Cluster 2: afrofuturism atl australian aesthetic boy 
Cluster 3: album aesthetic alternative british bedroom 
Cluster 4: america area british bubblegrunge bay 
Cluster 5: ambient afrofuturism area aesthetic art 
Cluster 6: afrofuturism aesthetic blues bounce band 
Cluster 7: alternative afrofuturism aesthetic alabama dirty 
Cluster 8: aesthetic alabama australian brighton art 
Cluster 9: art australian afrofuturism chamber atlanta 


In [143]:
# Fitting the SVD MATRIX of text vectors with song_features
true_k = 10

model = KMeans(n_clusters=true_k, init='k-means++', max_iter=500, n_init=100)
model.fit(preprocessing.normalize(f_xtrain))
#print(model.labels_)
#print(len(model.labels_))

for index, value in enumerate(model.labels_):
    print(f"STRING: {string_field[index]}", f"CLUSTER_INDEX: {value}")
    if index == 5:
        break
    
print('Top terms per cluster')
order_centroids = model.cluster_centers_.argsort()[:,::-1]
terms = vectorizer.get_feature_names_out()
#print(terms)
for i in range(true_k):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i,:5]:
        print(f"{terms[ind]} ", end="")
    print()

STRING: dance pop hip hop hip pop neo soul pop rap rhythm blues rap urban contemporary virginia hip hop CLUSTER_INDEX: 3
STRING: dance pop pop CLUSTER_INDEX: 3
STRING: pop rhythm blues CLUSTER_INDEX: 0
STRING: dance pop pop CLUSTER_INDEX: 3
STRING: dance pop pop rap reggae fusion CLUSTER_INDEX: 0
STRING: atl hip hop contemporary rhythm blues dance pop pop rhythm blues rap south carolina hip hop urban contemporary CLUSTER_INDEX: 3
Top terms per cluster
Cluster 0: atl alabama afrofuturism aesthetic art 
Cluster 1: atl alabama ambient aesthetic art 
Cluster 2: atl alabama afrofuturism aesthetic art 
Cluster 3: atl alabama afrofuturism art aesthetic 
Cluster 4: atl alabama aesthetic ambient afrofuturism 
Cluster 5: atl alabama afrofuturism aesthetic art 
Cluster 6: atl alabama aesthetic ambient afrofuturism 
Cluster 7: atl alabama afrofuturism aesthetic art 
Cluster 8: atl alabama ambient dark america 
Cluster 9: atl alabama afrofuturism art aesthetic 


In [364]:
#print(list(test_song[:1]))
#print(list(test_song)[0])
X_test = vectorizer.transform(list(test_song))
#print(X_test)
print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")
# feature_names = vectorizer.get_feature_names_out()
# print(feature_names)
prediction = model.predict(X_test)
print(prediction)

n_samples: 40, n_features: 92


ValueError: X has 92 features, but KMeans is expecting 14 features as input.

In [59]:
print(model.labels_)
print(prediction)

[4 1 7 7 3 7 7 9 8 1 9 0 7 3 9 7 4 6 7 2 3 0 3 0 0 9 7 8 1 1 2 3 3 6 6 3 0
 0 0 4 9 2 6 6 7 4 1 5 3 3 1 7 3 2 2 6 6 7 0 1 7 8 7 0 2 2 2 6 4 2 9 6 7 7
 0 0 0 4 0 3 4 5 3 7 9 5 4 4 4 3 1 1 4 2 0 1 5 3 7 1 7 4 4 9 4 1 3 3 7 3 9
 3 7 9 7 7 4 4 3 8 7 3 6 6 4 0 6 0 7 5 9 1 3 1 3 9 4 9 4 1 4 7 3 7 1 7 7 0
 6 1 9 1 2 7 1 3 4 4 1 2 0 3 2 2 1 4 1 4 4 7 0 7 4 0 4 4 9 2 4 1 3 1 2 2 0
 0 6 1 2 6 4 3 3 7 0 1 7 6 0 4 6 9 4 0 0 0 3 3 7 2 3 1 7 4 4 6 0 6 8 1 1 3
 6 4 0 0 2 1 8 0 1 9 0 0 4 2 0 4 1 2 2 6 7 6 9 9 9 9 3 6 2 6 3 2 6 9 6 6 0
 3 6 3 7 7 0 0 4 0 1 0 4 3 3 5 6 7 3 3 1 1 0 4 7 4 3 3 6 2 0 7 2 4 3 9 6 0
 2 0 7 2 0 7 7 9 3 3 4 0 1 4 4 4 4 0 4 7 7 0 0 5 5 0 0 4 0 0 0 4 4 4 4 9 0
 0 4 2 7 4 2 4 4 7 0 0 7 9 1 1 4 7 7 7 4 0 1 8 1 4 4 9 1 7 1 7 1 6 2 0 0 0
 4 4 9 4 7 4 7 7 7 0 0 8 4 7 7 4 7 0 7 3 4 0 7 6 4 2 7 0 3 7 7 2 2 7 0 8 7
 9 0 0 4 1 0 3 0 3 7 4 4 6 7 4 0 5 8 7 4 4 4 6 6 0 6 1 3 1 7 9 7 4 1 3 7 3
 3 1 0 1 7 3 6 6 6 2 9 0 2 7 0 7 7 0 4 0 7 7 2 1 6 0 2 6 6 7 9 6 0 1 2 4 0
 0 8 2 2 1 9 7 1 0 6 4 0 

NameError: name 'prediction' is not defined

In [108]:
# # Save machine learning model
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))

In [109]:
# # load model only once
# with open('finalized_model.sav', 'rb') as fid:
#     model = pickle.load(fid)

In [110]:
check_df['ClusterPrediction'] = ""
check_df['All_text']

0      dance pop hip hop hip pop neo soul pop rap r&b...
1                                          dance pop pop
2                                                pop r&b
3                                          dance pop pop
4                        dance pop pop rap reggae fusion
                             ...                        
97                             k pop k pop boy group pop
98                                k pop k pop girl group
99                                                 k pop
100                                                k pop
101                                                k pop
Name: All_text, Length: 102, dtype: object

In [111]:
def cluster_predict(str_input):
    #print(test_song)
    if isinstance(str_input, str):
        # It is a single string element thus we cannot use list() and have to use brackets [] to transform to list
        X_test = vectorizer.transform([str_input])
        prediction = model.predict(X_test)
    else:
        # if is not a single string element so we can transform into a list directly using list()
        X_test = vectorizer.transform(list(str_input))
        prediction = model.predict(X_test)
    return int(prediction)

In [112]:
check_df['All_text'][0]

'dance pop hip hop hip pop neo soul pop rap r&b rap urban contemporary virginia hip hop'

In [113]:
cluster_predict(check_df['All_text'][0])

2

In [114]:
cluster_predict([check_df['All_text'][0]])

2

In [115]:
result = check_df['All_text'].apply(lambda x: cluster_predict(x))
print(result)

0      2
1      2
2      2
3      2
4      2
      ..
97     2
98     2
99     2
100    2
101    2
Name: All_text, Length: 102, dtype: int64


In [116]:
# for i in check_df['All_text']:
#     pred = cluster_predict(i)
#     print(i, pred)
check_df['ClusterPrediction'] = check_df['All_text'].apply(lambda x: cluster_predict(x))

In [117]:
check_df

Unnamed: 0,track_artist_name,track_track_uri,track_track_name,track_album_name,playlist_name,playlist_pid,playlist_description,danceability,energy,key,...,valence,tempo,duration_ms,time_signature,song_popularity,artist_genre,artist_popularity,album_popularity,All_text,ClusterPrediction
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),The Cookbook,Throwbacks,0,,0.904,0.813,4,...,0.810,125.461,226864,4,69,dance pop hip hop hip pop neo soul pop rap r&b...,72,62,dance pop hip hop hip pop neo soul pop rap r&b...,2
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,In The Zone,Throwbacks,0,,0.774,0.838,5,...,0.924,143.040,198800,4,84,dance pop pop,80,78,dance pop pop,2
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0,,0.664,0.758,2,...,0.701,99.259,235933,4,21,pop r&b,87,17,pop r&b,2
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,Justified,Throwbacks,0,,0.892,0.714,4,...,0.817,100.972,267267,4,79,dance pop pop,79,77,dance pop pop,2
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,Hot Shot,Throwbacks,0,,0.853,0.606,0,...,0.654,94.759,227600,4,4,dance pop pop rap reggae fusion,73,1,dance pop pop rap reggae fusion,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,BTS,spotify:track:0WNGsQ1oAuHzNTk8jivBKW,Spring Day,You Never Walk Alone,korean,2,,0.539,0.846,8,...,0.460,106.992,274097,4,1,k pop k pop boy group pop,90,1,k pop k pop boy group pop,2
98,Lovelyz,spotify:track:24psBRmEw3kHjBGZfl1dmb,Ah-Choo,Lovelyz8,korean,2,,0.729,0.898,5,...,0.517,126.977,218475,4,41,k pop k pop girl group,41,41,k pop k pop girl group,2
99,LEE HI,spotify:track:06L1apH8kLF47dbhZ4Zg9A,BREATHE,SEOULITE,korean,2,,0.609,0.246,8,...,0.371,123.773,288993,4,66,k pop,66,66,k pop,2
100,LEE HI,spotify:track:2qWgqPdW1OiAP8KSBH1b93,FXXK WIT US,SEOULITE,korean,2,,0.763,0.658,5,...,0.389,80.038,217861,4,66,k pop,66,66,k pop,2


In [663]:
# for seed in range(5):
#     model = KMeans(
#         n_clusters=true_k,
#         max_iter=500,
#         n_init=1,
#         random_state=seed,
#     ).fit(X)
#     cluster_ids, cluster_sizes = np.unique(model.labels_, return_counts=True)
#     print(f"Number of elements asigned to each cluster: {cluster_sizes}")
# print()

In [118]:
def recommend_util(artist_name, song_name):
    
    # Predict category of input string category
    chosen_song_df = check_df.loc[(check_df['track_artist_name'] == artist_name) & (check_df['track_track_name'] == song_name)]
    str_input = chosen_song_df.track_track_name.str.cat(" " + chosen_song_df.artist_genre)
        
    prediction_inp = cluster_predict(str_input)
    prediction_inp = int(prediction_inp)
    
    temp_df = check_df.loc[check_df['ClusterPrediction'] == prediction_inp]
    new_temp_df = temp_df.sample(5)
    
    return chosen_song_df[['track_artist_name', 'track_track_name']], new_temp_df[['track_artist_name', 'track_track_name']]

In [119]:
song_choice = 2
print(check_df['track_artist_name'][song_choice], check_df['track_track_name'][song_choice])
temp_df = check_df.loc[(check_df['track_artist_name'] == check_df['track_artist_name'][0]) & (check_df['track_track_name'] == check_df['track_track_name'][0])]
string_input = temp_df.track_track_name.str.cat(" " + temp_df.artist_genre)
string_input
prediction_inp = cluster_predict(string_input)
print(prediction_inp)

Beyoncé Crazy In Love
2


In [120]:
song_choice = 1
original_song , recommended_songs = recommend_util(check_df['track_artist_name'][song_choice], check_df['track_track_name'][song_choice])
print(original_song)
print('\n')
print(recommended_songs)

  track_artist_name track_track_name
1    Britney Spears            Toxic


    track_artist_name  track_track_name
101             Ailee   I Will Show You
33               Iyaz            Replay
98            Lovelyz           Ah-Choo
14             Cassie            Me & U
19    Destiny's Child  Jumpin', Jumpin'


In [228]:
# Example of how to pull metadata from a single track 
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

Row = 5 

track_uri = subset_df.iloc[Row]['track_track_uri']
artist_uri = subset_df.iloc[Row]['track_artist_uri']
album_uri = subset_df.iloc[Row]['track_album_uri']

print('ARTIST: ', subset_df.iloc[Row]['track_artist_name'])
print('ALBUM: ', subset_df.iloc[Row]['track_album_name'])
print('SONG: ', subset_df.iloc[Row]['track_track_name'])
track_popularity = spotify.track(track_uri)['popularity']
print('track_popularity: ', track_popularity)
track_results = spotify.audio_features(tracks=track_uri)
print('audio_features: ', track_results[0])
artist_results = spotify.artist(artist_uri)
print('artist_genre: ', artist_results['genres'])
print('artist_popularity: ', artist_results['popularity'])
artist_albums_results = spotify.artist_albums(artist_uri)
#print(artist_albums_results)
album_results = spotify.album(album_uri)
print('album_popularity: ',album_results['popularity'])
print('album_keys: ', album_results.keys())
# print(artist_results['genres'])
# print(artist_results['popularity'])

ARTIST:  Ron Pope
ALBUM:  The Bedroom Demos
SONG:  A Drop In The Ocean
track_popularity:  67
audio_features:  {'danceability': 0.447, 'energy': 0.393, 'key': 5, 'loudness': -8.65, 'mode': 1, 'speechiness': 0.038, 'acousticness': 0.785, 'instrumentalness': 0, 'liveness': 0.28, 'valence': 0.564, 'tempo': 73.139, 'type': 'audio_features', 'id': '5JDcQAztvZTIkrWoZihgvC', 'uri': 'spotify:track:5JDcQAztvZTIkrWoZihgvC', 'track_href': 'https://api.spotify.com/v1/tracks/5JDcQAztvZTIkrWoZihgvC', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/5JDcQAztvZTIkrWoZihgvC', 'duration_ms': 220239, 'time_signature': 3}
artist_genre:  ['neo mellow', 'piano rock', 'viral pop']
artist_popularity:  53
album_popularity:  59
album_keys:  dict_keys(['album_type', 'artists', 'available_markets', 'copyrights', 'external_ids', 'external_urls', 'genres', 'href', 'id', 'images', 'label', 'name', 'popularity', 'release_date', 'release_date_precision', 'total_tracks', 'tracks', 'type', 'uri'])


In [229]:
# Work only with important features
important_features = ['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_duration_ms', 'playlist_description']

In [None]:
artist_count_df = subset_df[['playlist_name','track_artist_name']].groupby(['playlist_name'], sort=False).value_counts(sort=False).reset_index(name="artist_count")

In [None]:
artist_count_df

In [None]:
subset_df[['playlist_name', 'track_artist_name']]

In [None]:
subset_df[['playlist_name','track_artist_name']].groupby(['playlist_name'], sort = False).value_counts(sort=False).reset_index(name="artist_count").info()

In [None]:
music_df = pd.merge(subset_df, artist_count_df,  how='left', on = ['playlist_name','track_artist_name'])

In [None]:
music_df

In [None]:
# subset_df.join(.sum(), on='playlist_name', rsuffix='_count')
subset_df.groupby(['playlist_name','track_artist_name']).size().unstack(fill_value=0)

In [None]:
music_df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1)

In [None]:
important_features.append("artist_count")

In [None]:
music_df[important_features]

In [None]:
music_df['artist_fraction'] =  music_df['artist_count'] / music_df['playlist_num_tracks'] 
music_df

In [None]:
subset_df.groupby(['playlist_name','track_artist_name']).size().unstack(fill_value=0)

In [None]:
music_df[music_df['playlist_num_tracks'] == Ntracks][['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).value_counts() 

In [None]:
vc = music_df[music_df['playlist_num_tracks'] == Ntracks][['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x), axis=1).value_counts()
vc[vc > 1].sum() - len(vc[vc > 1])

In [None]:
# for i in df[['track_artist_name','track_track_name']].apply(lambda x: ',.'.join(x),axis=1).unique():
# #     print(i.split(',.'))
#     if len(i.split(',.')) == 2:
#         print(i)

In [None]:
np.unique(music_df[music_df['playlist_num_tracks'] == Ntracks]['track_track_name'])

In [None]:
important_features = ['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_num_tracks', 'playlist_num_albums', 'playlist_duration_ms', 'playlist_num_artists']

In [None]:
music_df[music_df['playlist_num_tracks'] == 20][['track_artist_name', 'track_track_name', 'playlist_name', 'playlist_num_tracks', 'playlist_num_albums', 'playlist_duration_ms', 'playlist_num_artists', 'playlist_description']]

In [None]:
music_df[music_df['playlist_num_tracks'] == Ntracks][important_features].groupby('playlist_name')['track_artist_name'].value_counts()