In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipynb
from scipy.spatial import distance_matrix
import random
import sys as sys
import time

In [2]:
from ipynb.fs.full.datamanagement import user_filter
from ipynb.fs.full.datamanagement import summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

## Data loading

In [3]:
# Read the data.
# os.path.abspath('') resolves to the current working directory, so the data
# folder is looked up as "<parent of the working directory>/data".
CUR_DIR = os.path.abspath('')

DATA_DIR = Path(CUR_DIR).parent / "data"
FILE_PATH = DATA_DIR/'triplets_metadata_spotify.csv'
# The first CSV column is used as the row index of the frame.
df = pd.read_csv(FILE_PATH, index_col = 0)

In [4]:
df.head()

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,-7.933,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24
1,9fba771d9731561eba47216f6fbfc0023d88641b,19,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,-7.933,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24
2,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,-7.933,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24
3,537340ff896dea11328910013cfe759413e1eeb3,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,-7.933,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24
4,8fce200f3912e9608e3b1463cdb9c3529aab5c08,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,-7.933,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24


In [4]:
df['track_id'] = df.song_name + '-' + df.artist_name

In [6]:
df.head()

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
1,9fba771d9731561eba47216f6fbfc0023d88641b,19,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
2,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
3,537340ff896dea11328910013cfe759413e1eeb3,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
4,8fce200f3912e9608e3b1463cdb9c3529aab5c08,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West


In [5]:
len(df)

4982520

## Data management

In [6]:
## Keep only the first occurrence of each (user, track_id) pair to reduce redundant observations.
## NOTE(review): the rows dropped here are discarded as a whole, so their listening_count
## is not added to the row that is kept -- confirm this is the intended behaviour.

df = df.drop_duplicates(subset = ['user','track_id'], keep='first')
len(df)

4973744

In [9]:
df.head()

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
1,9fba771d9731561eba47216f6fbfc0023d88641b,19,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
2,85952991b8e3ca5803a08b0b2f9c6d71abf9bb5b,1,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
3,537340ff896dea11328910013cfe759413e1eeb3,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West
4,8fce200f3912e9608e3b1463cdb9c3529aab5c08,2,Stronger,Graduation,Kanye West,311.84934,0.877214,1.082503,2007,-1,...,0,0.176,0.00763,0.0,0.526,0.495,103.977,4.0,2011-05-24,Stronger-Kanye West


### Users filtering

In [7]:
users_summary = summarise_listening_history(df)

In [8]:
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 500, ntracks_min = 5)

In [9]:
wdf = df.loc[df['user'].isin(ids),:]
wdf_users_summary = users_summary.loc[users_summary['user'].isin(ids),:]

In [13]:
print('Nb of ligns in df  :', len(df))
print('Nb of ligns in wdf :', len(wdf))

Nb of ligns in df  : 4973744
Nb of ligns in wdf : 3856377


In [16]:
len(wdf_users_summary)

327081

In [13]:
wdf_users_summary.head()

Unnamed: 0,user,listening_count,track_count
7,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10
11,0000bb531aaa657c932988bc2f7fd7fc1b2050ec,14,10
12,0000d3c803e068cf1da17724f1674897b2dd7130,7,5
14,0000f88f8d76a238c251450913b0d070e4a77d19,30,8
15,000138e252eea35fd73aaf66a9b34102b695a9c8,26,13


### Songs management

In [10]:
# Get the songs quantitative features: drop the user-level columns, then every
# descriptive / categorical column, keeping track_id plus the remaining columns.

tracks_feats = df.drop(['user', 'listening_count'],axis = 1)
# Rebuilds track_id with the same expression that an earlier cell already used on df.
tracks_feats['track_id'] = df.song_name + '-' + df.artist_name
# drop_duplicates() compares all remaining columns, not only track_id.
# NOTE(review): a track_id whose feature values differ between rows would therefore be
# kept more than once and make the index of X non-unique -- verify.
tracks_quanti_feats = tracks_feats.drop(['song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
       'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature', 'release_date', 'duration', 'tempo'], axis = 1).drop_duplicates()

In [11]:
# Normalize the songs quantitative feats: X holds one row per entry of
# tracks_quanti_feats, indexed by track_id, with every feature column standardised.
X = tracks_quanti_feats.drop(['track_id'], axis = 1)
X.index = tracks_quanti_feats.track_id

scaler = StandardScaler()
# The frame built from the scaler output has default integer column labels.
# NOTE(review): the assignment presumably pairs those columns with X.columns by
# position -- confirm, or pass columns=X.columns explicitly.
X[X.columns] = pd.DataFrame(scaler.fit_transform(X), index=X.index)


In [27]:
# Compute the pairwise distance matrix between the songs.
# The result is a dense len(X) x len(X) frame labelled by track_id on both axes.
# NOTE(review): memory grows quadratically with the number of tracks -- confirm it fits
# in RAM for the full catalogue. D is reassigned by a later scratch cell.
D = pd.DataFrame(distance_matrix(X, X), index = X.index, columns = X.index)

## Raw K-Nearest Neighbors

### Split the data into hidden and apparent sets

In [12]:
wdf_sorted = wdf.sort_values('user')
wdf_users_summary_sorted = wdf_users_summary.sort_values('user')

In [21]:
# Try 1; Triplets non sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 1:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 1: 42.95786738395691 seconds


In [22]:
# Try 2: Triplets non sorted by users, with users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 36.215386390686035 seconds


In [23]:
# Try 3: Triplets sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 3:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 21.979166746139526 seconds


In [13]:
# Try 4: Triplets sorted by users, with users_summary (also sorted by user)
# test1 / test2 are overwritten by every "Try" cell; later cells use whichever ran last.
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, users_summary = wdf_users_summary_sorted, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 4:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 4: 10.384645938873291 seconds


In [103]:
def random_k_reco(triplets, list_tracks, k, random_state=None):
    """Recommend ``k`` random tracks to every user, excluding tracks already listened to.

    Parameters
    ----------
    triplets : pd.DataFrame
        Listening history with at least the columns ``user`` and ``track_id``.
    list_tracks : iterable
        Catalogue of track ids the recommendations are drawn from.
    k : int
        Number of tracks recommended to each user.
    random_state : int, optional
        Seed of a private generator used for the draw. When ``None`` (default) the
        module-level ``random`` generator is used, so ``random.seed(...)`` still applies.

    Returns
    -------
    pd.DataFrame
        Columns ``user`` and ``track_id`` with ``k`` consecutive rows per user, users in
        order of first appearance. Empty (same columns) when ``triplets`` has no rows.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a user has fewer than ``k`` unlistened tracks.
    """
    rng = random if random_state is None else random.Random(random_state)

    # De-duplicate once, keeping the catalogue order: the candidate list then has a
    # stable order, which a seeded draw needs in order to be reproducible.
    catalogue = list(dict.fromkeys(list_tracks))

    users, picks = [], []
    # One pass over the history instead of one boolean filter of the frame per user.
    for user, listened in triplets.groupby('user', sort=False)['track_id']:
        seen = set(listened)
        candidates = [track for track in catalogue if track not in seen]
        picks.extend(rng.sample(candidates, k))
        users.extend([user] * k)

    return pd.DataFrame({'user': users, 'track_id': picks})

In [106]:
sample = random.sample(list(test1.user.unique()), 1000)

In [107]:
sum(test1.user.isin(sample))

9586

In [108]:
df_apparent_sample =  test1.loc[test1.user.isin(sample),: ]
df_hidden_sample = test2.loc[test2.user.isin(sample),: ]

In [127]:
reco = random_k_reco(df_apparent_sample, X.index, 1000)

In [130]:
def scoring_accuracy_vs_serendipity(hidden_triplets, recommended_triplets):
    """Score recommendations against each user's hidden listening history.

    For every user present in ``hidden_triplets``:

    * accuracy    = share of the user's hidden tracks that were recommended
      (computed as ``1 - |hidden - recommended| / |hidden|``);
    * serendipity = share of the user's recommended tracks that are not hidden tracks
      (``|recommended - hidden| / |recommended|``).

    Parameters
    ----------
    hidden_triplets : pd.DataFrame
        Held-out history with at least the columns ``user`` and ``track_id``.
    recommended_triplets : pd.DataFrame
        Recommendations with at least the columns ``user`` and ``track_id``.

    Returns
    -------
    (list[float], list[float])
        ``accuracy`` and ``serendipity``, one value per user in order of first appearance
        in ``hidden_triplets``. A user without any recommendation gets accuracy ``0.0``
        and serendipity ``nan`` (the ratio is undefined) instead of a ZeroDivisionError.
    """
    def tracks_by_user(triplets):
        # Single pass over the frame; replaces one boolean filter per user and metric.
        return {user: set(tracks)
                for user, tracks in triplets.groupby('user', sort=False)['track_id']}

    hidden_by_user = tracks_by_user(hidden_triplets)
    reco_by_user = tracks_by_user(recommended_triplets)

    accuracy, serendipity = [], []
    for user, hidden in hidden_by_user.items():
        recommended = reco_by_user.get(user, set())
        accuracy.append(1 - len(hidden - recommended) / len(hidden))
        if recommended:
            serendipity.append(len(recommended - hidden) / len(recommended))
        else:
            serendipity.append(float('nan'))
    return accuracy, serendipity

In [131]:
acc, ser = scoring_accuracy_vs_serendipity(df_hidden_sample, reco)

In [139]:
pd.Series(acc).describe()

count    1000.000000
mean        0.033309
std         0.140897
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
dtype: float64

## Work in progress

In [None]:
def get_opt_clusters(df:pd.DataFrame, drop : list[str], user_id: str, max_clusters: int = 16):
    """Sweep the number of KMeans clusters for one user and pick the best silhouette.

    Parameters
    ----------
    df : pd.DataFrame
        Listening history containing ``user``, ``listening_count`` and the feature columns.
    drop : list[str]
        Columns removed before clustering. ``listening_count`` must NOT be listed: it is
        used as the sample weight and removed afterwards.
        NOTE(review): presumably ``drop`` includes ``user`` and every non-numeric column.
    user_id : str
        Identifier of the user whose tracks are clustered.
    max_clusters : int, default 16
        Largest number of clusters tried. The sweep is additionally capped at
        ``n_tracks - 1``, the largest value for which a silhouette score is defined.

    Returns
    -------
    candidates : np.ndarray
        Numbers of clusters that were tried (2, 3, ...).
    s_scores : list[float]
        Silhouette score for each candidate.
    dist : list[float]
        Mean distance of the user's tracks to their closest centroid, for each candidate.
    n_clusters : int
        Candidate with the highest silhouette score (first one on ties).

    Raises
    ------
    ValueError
        If the user has fewer than 3 tracks, so that no candidate can be scored.
    """
    # Imported here because the notebook's import cells only bring in distance_matrix.
    from scipy.spatial.distance import cdist

    df_user = df[df['user'] == user_id].drop(drop, axis=1).reset_index(drop=True)

    # Listening counts weight the KMeans fit and are not clustering features themselves.
    user_listenings = df_user['listening_count']
    df_user = df_user.drop('listening_count', axis=1)

    largest = min(max_clusters, len(df_user) - 1)
    if largest < 2:
        raise ValueError(
            f"User {user_id!r} has {len(df_user)} track(s); at least 3 are needed "
            "to compare clusterings."
        )
    candidates = np.arange(2, largest + 1)

    dist = []
    s_scores = []
    for k in candidates:
        clf = KMeans(n_clusters = int(k), n_init = 'auto', random_state=123)
        clf.fit(df_user, sample_weight=user_listenings)
        s_scores.append(silhouette_score(df_user, clf.labels_, metric='euclidean'))
        # Rows are centroids, columns are tracks: min over rows = distance to the closest centroid.
        to_centroids = cdist(clf.cluster_centers_, df_user, metric='euclidean')
        dist.append(to_centroids.min(axis=0).mean())

    # Optimal number of clusters by silhouette score.
    n_clusters = int(candidates[int(np.argmax(s_scores))])

    return candidates, s_scores, dist, n_clusters

In [None]:
def get_user_profiles(df: pd.DataFrame, drop : list[str],  user_id: str, n_clusters:int):
    """
    Build the acoustic profiles of one user.

    Parameters:
        df : dataframe with the columns ``user``, ``listening_count`` and the acoustic
             features (the docstring of the original notebook states that these are
             Standard scaled)
        drop : columns removed from the user's rows before the KMeans fit;
               ``listening_count`` must not be listed, it is used as the sample weight
        user_id : str corresponding to the user id
        n_clusters : number of clusters (and thus of cluster profiles) to fit

    Return :
        standard_profile: list[list[float]] of size 1, the listening-count weighted mean of
                          danceability, energy, loudness, speechiness, acousticness,
                          instrumentalness, liveness, valence and tempo (in that order)
        cluster_profile: list[list[float]] of size n_clusters, the centroids of a KMeans
                         fit weighted by listening_count

    Raises:
        ValueError : if ``df`` holds no row for ``user_id``
    """
    profile_features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                        'instrumentalness', 'liveness', 'valence', 'tempo']

    # Filter the user's rows once; both profiles are derived from this slice.
    user_rows = df[df['user'] == user_id]
    if user_rows.empty:
        raise ValueError(f"No listening history found for user {user_id!r}")

    ## Standard profile: one profile where acoustic characteristics are weighted by listening_count
    weights = user_rows['listening_count'].to_numpy(dtype=float)
    features = user_rows[profile_features].to_numpy(dtype=float)
    # Weighted mean computed in one vectorised step instead of a row-by-row accumulation.
    standard_profile = [((features * weights[:, None]).sum(axis=0) / weights.sum()).tolist()]

    ## Cluster profile
    df_user = user_rows.drop(drop, axis=1).reset_index(drop=True)

    # weights to apply to the kmeans, then removed from the clustering features
    user_listenings = df_user['listening_count']
    df_user = df_user.drop('listening_count', axis=1)

    clf = KMeans(n_clusters = n_clusters, n_init = 'auto', random_state=123)
    clf.fit(df_user, sample_weight=user_listenings)

    # distinct profiles as cluster centroids
    cluster_profile = clf.cluster_centers_.tolist()

    return standard_profile, cluster_profile

In [None]:
def get_profile_neighbors(df_features, df_features_names: pd.DataFrame, profile: list[list[float]], n_target : int = 20):
    """Find, for each profile, the closest and the furthest tracks of the catalogue.

    Parameters
    ----------
    df_features : array-like of shape (n_tracks, 9)
        Acoustic features of the tracks, in the column order danceability, energy,
        loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo.
    df_features_names : pd.DataFrame
        Same tracks in the same row order. By position, columns 0-8 are read as the
        features and columns 9-10 as the two name columns joined into the track label.
        The frame is not modified.
    profile : list[list[float]]
        One 9-value feature vector per profile.
    n_target : int, default 20
        Number of closest / furthest tracks returned per profile, capped at the number
        of tracks. The previous default referenced a global ``n_neighbors`` that the
        notebook never defines, which made the ``def`` itself fail.

    Returns
    -------
    sim, dis : list[list[list[float]]]
        Per profile, the feature vectors of the closest (nearest first) and furthest
        (furthest first) tracks.
    sim_id, dis_id : list[list[str]]
        Matching ``"<column 9> - <column 10>"`` labels.
    """
    feature_cols = ["danceability", "energy",
                    "loudness", "speechiness", "acousticness",
                    "instrumentalness", "liveness", "valence",
                    "tempo"]

    if len(profile) == 0:
        return [], [], [], []

    # Building the frame validates that every profile has exactly nine values.
    user_profile = pd.DataFrame(profile, columns=feature_cols)

    # distances[p, t] = euclidean distance between profile p and track t
    distances = distance_matrix(user_profile.to_numpy(dtype=float),
                                np.asarray(df_features, dtype=float))

    n_target = max(0, min(n_target, len(df_features_names)))

    sim, dis, sim_id, dis_id = [], [], [], []
    for profile_distances in distances:
        # Rank tracks by position so that neither the index labels of the caller's
        # frame nor its columns are touched.
        order = np.argsort(profile_distances, kind='stable')
        closest = df_features_names.iloc[order[:n_target]]
        furthest = df_features_names.iloc[order[::-1][:n_target]]

        sim.append(closest.iloc[:, :9].values.tolist())
        sim_id.append([' - '.join(names) for names in closest.iloc[:, 9:11].values])
        dis.append(furthest.iloc[:, :9].values.tolist())
        dis_id.append([' - '.join(names) for names in furthest.iloc[:, 9:11].values])

    return sim, dis, sim_id, dis_id

In [14]:
# Listening history of one example user.
# NOTE(review): wdf_sorted.user[0] is a lookup by index label 0, not by position, when
# the index holds integer labels -- so this presumably selects the user of the row
# labelled 0 rather than the first row of the sorted frame. Confirm; use
# wdf_sorted.user.iloc[0] if the first sorted user is what is wanted.
triplets = wdf_sorted[wdf_sorted.user == wdf_sorted.user[0]]

In [18]:
triplets.head()

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
27188,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Ragged Wood,Fleet Foxes,Fleet Foxes,307.17342,0.775166,0.547609,2008,-1,...,1,0.0296,0.105,0.0133,0.0585,0.156,104.732,4.0,2018-11-09,Ragged Wood-Fleet Foxes
45162,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Drive,Make Yourself,Incubus,232.46322,0.89928,0.579442,1999,-1,...,0,0.039,0.0591,0.00994,0.166,0.689,90.557,4.0,2007-06-05,Drive-Incubus
59295,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,He Doesn't Know Why,Fleet Foxes,Fleet Foxes,200.46322,0.775166,0.547609,2008,-1,...,1,0.0303,0.265,7e-05,0.336,0.407,93.942,4.0,2018-11-09,He Doesn't Know Why-Fleet Foxes
30658,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Mykonos,Sun Giant,Fleet Foxes,275.27791,0.775166,0.547609,2008,-1,...,1,0.0305,0.395,0.000834,0.137,0.342,82.985,4.0,2018-11-09,Mykonos-Fleet Foxes
22873,b80344d063b5ccb3212f76538f3d9e43d87dca9e,1,Gimme Stitches,There Is Nothing Left To Lose,Foo Fighters,222.1971,0.839275,0.550762,1999,-1,...,1,0.0685,0.00175,0.00286,0.263,0.532,111.85,4.0,1999-11-02,Gimme Stitches-Foo Fighters


In [21]:
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Stronger-Kanye West,0.667073,0.244482,0.234685,0.971940,-0.825165,-0.623569,1.661334,0.055057
Stacked Actors-Foo Fighters,-0.048441,1.150060,0.862181,-0.294877,-0.842482,-0.620462,0.105924,0.728483
Clarity-John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.526080,-0.197701,0.039664
Gimme Stitches-Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438
Breakout-Foo Fighters,-0.501227,1.165741,1.072171,-0.272758,-0.848599,-0.614704,-0.480018,-0.206617
...,...,...,...,...,...,...,...,...
Dime-Jerry Rivera,0.991291,-0.429800,-0.219056,-0.392401,1.575591,-0.623569,0.015369,1.247983
The Long Conversation-Pierre de Reeder,-0.395018,-1.370660,-0.236161,-0.523105,1.705467,-0.520901,0.825034,-1.007032
That's the Way That It Was-Pierre de Reeder,0.583224,0.228801,0.703957,-0.425580,-0.754445,-0.623329,-0.543939,-0.679940
I'll Close My Eyes-Mike Jones,-0.283219,-2.126092,-3.399071,-0.445688,2.187863,2.115258,3.328605,-0.899284


In [49]:
def _neighbors_per_cluster(cluster_sizes, n_neighbors):
    """Split ``n_neighbors`` across clusters proportionally to their size.

    Uses largest-remainder rounding so the quotas are non-negative and sum to
    ``n_neighbors`` exactly.
    """
    shares = n_neighbors * cluster_sizes / cluster_sizes.sum()
    quotas = np.floor(shares).astype(int)
    shortfall = int(n_neighbors - quotas.sum())
    if shortfall > 0:
        # quotas - shares is minus the fractional part: ascending order = largest remainder first
        largest_remainders = np.argsort(quotas - shares, kind='stable')[:shortfall]
        quotas[largest_remainders] += 1
    return quotas


def kmeans_based_knn(triplets, X, n_neighbors, weighted = True, n_clusters = 'auto', random_state = None):
    """Recommend the unlistened tracks closest to the centroids of a user's tracks.

    Parameters
    ----------
    triplets : pd.DataFrame
        Listening history of ONE user, with the columns ``track_id`` and
        ``listening_count``.
    X : pd.DataFrame
        Track features indexed by track id.
    n_neighbors : int
        Total number of tracks to recommend (previously overwritten by a hard-coded 20).
    weighted : bool, default True
        If True, centroids are weighted by ``listening_count``.
        NOTE(review): the former code multiplied each feature row by its weight, which
        could not align (integer index vs track_id index) and would have shrunk the
        centroids towards the origin; sample weights are assumed to be the intent,
        as in the other profile functions of this notebook -- confirm.
    n_clusters : 'auto' or int, default 'auto'
        ``'auto'`` delegates the partition to ``get_triplets_partition`` (which takes no
        weights; when ``weighted`` the centroids of its partition are recomputed as
        weighted means). ``1`` uses a single centroid. Any other int runs KMeans.
    random_state : int, optional
        Seed of the KMeans fit when ``n_clusters`` is an int greater than 1.

    Returns
    -------
    list
        Up to ``n_neighbors`` distinct track ids the user has not listened to, grouped by
        centroid and ordered nearest first within each centroid. Each centroid
        contributes a share proportional to the size of its cluster.
    """
    # Features of the songs listened to by the user
    triplets_feats = X.loc[triplets.track_id]
    feats = triplets_feats.to_numpy(dtype=float)

    if weighted:
        # Look weights up by track id so that they follow the rows of triplets_feats.
        listens = triplets.groupby('track_id')['listening_count'].sum()
        weights = listens.reindex(triplets_feats.index).to_numpy(dtype=float)
    else:
        weights = np.ones(len(feats))

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        labels, centroids, _ = get_triplets_partition(triplets_feats)
        labels = np.asarray(labels, dtype=int)
        centroids = np.asarray(centroids, dtype=float)
        if weighted:
            centroids = np.vstack([
                np.average(feats[labels == c], axis=0, weights=weights[labels == c])
                if np.any(labels == c) else centroids[c]
                for c in range(len(centroids))
            ])
    elif n_clusters == 1:
        labels = np.zeros(len(feats), dtype=int)
        # Kept 2-D: distance_matrix needs shape (n_centroids, n_features).
        centroids = np.average(feats, axis=0, weights=weights)[None, :]
    else:
        clf = KMeans(n_clusters = n_clusters, n_init = 'auto', random_state = random_state)
        clf.fit(triplets_feats, sample_weight = weights if weighted else None)
        centroids = clf.cluster_centers_
        labels = clf.labels_

    # Candidate tracks: everything the user has not listened to yet.
    candidates = X.loc[~X.index.isin(triplets_feats.index)]
    distances = distance_matrix(centroids, candidates.to_numpy(dtype=float))
    nearest_first = np.argsort(distances, axis=1, kind='stable')

    # Sizes indexed by centroid label (value_counts() would be ordered by size instead).
    cluster_sizes = np.bincount(labels, minlength=len(centroids))
    quotas = _neighbors_per_cluster(cluster_sizes, n_neighbors)

    track_ids = candidates.index
    selected, seen = [], set()
    for cluster, quota in enumerate(quotas):
        picked = 0
        for position in nearest_first[cluster]:
            if picked == quota:
                break
            track = track_ids[position]
            # A track close to two centroids is only recommended once.
            if track in seen:
                continue
            seen.add(track)
            selected.append(track)
            picked += 1
    return selected
    


[1;31mInit signature:[0m
[0mKMeans[0m[1;33m([0m[1;33m
[0m    [0mn_clusters[0m[1;33m=[0m[1;36m8[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0minit[0m[1;33m=[0m[1;34m'k-means++'[0m[1;33m,[0m[1;33m
[0m    [0mn_init[0m[1;33m=[0m[1;34m'warn'[0m[1;33m,[0m[1;33m
[0m    [0mmax_iter[0m[1;33m=[0m[1;36m300[0m[1;33m,[0m[1;33m
[0m    [0mtol[0m[1;33m=[0m[1;36m0.0001[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcopy_x[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0malgorithm[0m[1;33m=[0m[1;34m'lloyd'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
K-Means clustering.

Read more in the :ref:`User Guide <k_means>`.

Parameters
----------

n_clusters : int, default=8
    The number of clusters to form as well as the number of
    centroi

In [30]:
triplets_feats = X.loc[triplets.track_id]
triplets_feats

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ragged Wood-Fleet Foxes,-0.831034,0.064151,0.530878,-0.49998,-0.52407,-0.58305,-0.82892,-1.249466
Drive-Incubus,0.711793,0.852121,0.816942,-0.405472,-0.666006,-0.593287,-0.256295,0.801598
He Doesn't Know Why-Fleet Foxes,-0.054031,-0.390598,0.620005,-0.492942,-0.029304,-0.623356,0.649252,-0.28358
Mykonos-Fleet Foxes,-1.177612,-0.7395,-0.072535,-0.490932,0.372693,-0.621028,-0.410771,-0.53371
Gimme Stitches-Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438
Clarity-John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.52608,-0.197701,0.039664
All That We Perceive-Thievery Corporation,1.0416,0.758035,0.563963,-0.459764,-0.767433,2.231026,-0.713863,1.840598
Stronger-Kanye West,0.667073,0.244482,0.234685,0.97194,-0.825165,-0.623569,1.661334,0.055057
Bigger Isn't Better-The String Cheese Incident,1.10309,-0.923752,-1.811877,-0.482888,1.269455,0.820484,-0.701078,1.086361
Are You In?-Incubus,0.085718,-0.210267,0.054629,-0.405472,-0.106611,0.116737,3.578962,-0.449051


In [55]:
# Distances between centroids and every track of X that is not in triplets_feats.
# NOTE(review): `centroids` is only assigned inside function bodies in the visible
# notebook, never at notebook level, so on a fresh kernel this cell raises NameError.
# Presumably scratch left over from developing kmeans_based_knn -- confirm, then remove
# it or define `centroids` first. It also overwrites the D computed earlier.
D = pd.DataFrame(distance_matrix(centroids, X.loc[X.index.difference(triplets_feats.index)]), columns = X.index.difference(triplets_feats.index))           

In [121]:

# while True:
#     R.loc[R['rank'] <= 10]

['Be With-Koushik',
 'Blind Revolution Mad-Winger',
 'Complicated-Avril Lavigne',
 "Eyes Won't See-Austin Collins",
 'Heaven (Only Knows)-Streetwize',
 'Heiterkeit-PeterLicht',
 'Jumpin´ The Blues-Maceo Parker',
 'Just Like Anyone-Soul Asylum',
 'Last Cigarette-A.R.E. Weapons',
 "Looters' Follies-Destroyer",
 'Mentir-Gabinete Caligari',
 'Myself-Seminole County',
 'O Come All Ye Faithful-Third Day',
 'Pocahontas Proud-Gretchen Wilson',
 'Summer-A&E',
 'Then the Letting Go-The Mountain Goats',
 'Third Party-Unter Null',
 'Tällaisena Kesäyönä-Scandinavian Music Group',
 'You Are My Starship-Norman Connors',
 "Your Cover's Blown-Belle & Sebastian"]

In [33]:
def get_triplets_partition(triplets_feats, max_clusters = 10):
    """Partition a user's tracks with KMeans, choosing the cluster count by silhouette.

    Parameters
    ----------
    triplets_feats : pd.DataFrame or array-like of shape (n_tracks, n_features)
        Features of the tracks to partition.
    max_clusters : int, default 10
        Largest number of clusters tried (inclusive). The sweep starts at 2 and is also
        capped at ``n_tracks - 1``, the largest value with a defined silhouette score.

    Returns
    -------
    labels : np.ndarray of shape (n_tracks,)
        Cluster label of each track.
    centroids : np.ndarray of shape (n_clusters, n_features)
        Cluster centres.
    score : float
        Silhouette score of the returned partition; ``nan`` when no partition with at
        least two clusters could be scored (fewer than 3 tracks, or all tracks
        identical). In that case a single cluster centred on the mean is returned.

    Raises
    ------
    ValueError
        If ``triplets_feats`` is empty.
    """
    n_tracks = len(triplets_feats)
    if n_tracks == 0:
        raise ValueError("triplets_feats is empty: nothing to partition")

    best_labels, best_centroids, best_score = None, None, None

    # Search the optimal number of clusters
    for k in range(2, min(max_clusters, n_tracks - 1) + 1):
        clf = KMeans(n_clusters = k, n_init = 'auto', random_state=123)
        clf.fit(triplets_feats)
        # Duplicate points can collapse into one cluster, which cannot be scored.
        if len(np.unique(clf.labels_)) < 2:
            continue
        score = silhouette_score(triplets_feats, clf.labels_, metric='euclidean')
        # Strict comparison keeps the smallest k on ties.
        if best_score is None or score > best_score:
            best_labels, best_centroids, best_score = clf.labels_, clf.cluster_centers_, score

    if best_score is None:
        centroid = np.asarray(triplets_feats, dtype=float).mean(axis=0, keepdims=True)
        return np.zeros(n_tracks, dtype=int), centroid, float('nan')

    # Return the best partition found
    return best_labels, best_centroids, best_score

In [32]:
# Partition the example user's tracks (the function is named get_triplets_partition).
get_triplets_partition(triplets_feats)

(array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]),
 array([[-0.03120504,  0.35914953,  0.64151838, -0.2777847 , -0.52661788,
         -0.54205681,  0.34220923, -0.12195798],
        [ 0.46583509, -0.80980317, -1.27485968, -0.45708282,  0.84818285,
          0.80932675, -0.64585787,  0.52196577]]),
 0.3762583221316277)