In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

In [3]:
from ipynb.fs.full.datamanagement import user_filter, summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history

In [4]:
def get_listenings_history_partition(listenings_history_feats, max_clusters = 15, random_state = None):
    """
    Partition the tracks listened by a user with KMeans, keeping the number of
    clusters that maximises the silhouette score.

    Parameters:
        listenings_history_feats (pandas.DataFrame or array-like): one row of features per listened track.

        max_clusters (int, default = 15): EXCLUSIVE upper bound of the numbers of clusters tried.
            The candidates are 2, 3, ..., min(max_clusters, n_tracks - 1) - 1.

        random_state (int, default = None): forwarded to KMeans; pass an int for reproducible output.

    Return :
        (labels, centroids, silhouette_score, n_clusters) of the best candidate.
        When several candidates reach the same score, the smallest number of clusters wins.

    Raises:
        ValueError: if there is no candidate number of clusters to try
            (too few tracks, or max_clusters <= 2).
    """
    n_tracks = len(listenings_history_feats)
    n_clusters_range = np.arange(2, min(max_clusters, n_tracks - 1))

    # Fail with an explicit message instead of the opaque "max() arg is an empty sequence"
    if len(n_clusters_range) == 0:
        raise ValueError(
            "Cannot search the optimal number of clusters: no candidate in "
            f"[2, min(max_clusters, n_tracks - 1)) with n_tracks={n_tracks} and max_clusters={max_clusters}."
        )

    # Search the optimal number of clusters, remembering only the best partition seen so far
    best = None
    for n_clusters in n_clusters_range:
        clf = KMeans(n_clusters = int(n_clusters), n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats)
        score = silhouette_score(listenings_history_feats, clf.labels_, metric='euclidean')
        # Strict comparison: on ties the first (smallest) number of clusters is kept
        if best is None or score > best[2]:
            best = (clf.labels_, clf.cluster_centers_, score, int(n_clusters))

    return best

In [5]:
def _allocate_neighbors(cluster_sizes, n_neighbors):
    """Split n_neighbors between the clusters proportionally to their sizes (largest-remainder rounding)."""
    sizes = np.asarray(cluster_sizes, dtype = float)
    exact = n_neighbors * sizes / sizes.sum()
    quotas = np.floor(exact).astype(int)
    missing = int(n_neighbors - quotas.sum())
    if missing > 0:
        # Give the leftover slots to the clusters with the largest fractional parts
        by_remainder = np.argsort(quotas - exact, kind = 'stable')
        quotas[by_remainder[:missing]] += 1
    return quotas.tolist()


def kmeans_based_knn(listenings_history, X, n_neighbors, weighted = False, n_clusters ='Default',  random_state = None):
    """
    Recommend the unlistened tracks that are the closest to the centroid(s) of the
    tracks listened by a user. Each centroid contributes a number of tracks
    proportional to the size of its cluster.

    Parameters:
        listenings_history (pandas.DataFrame): listening history of ONE user; the columns
            `track_id` and `listening_count` are read.

        X (pandas.DataFrame): features of the tracks, indexed by track id.

        n_neighbors (int): total number of tracks to recommend.

        weighted (boolean, default = False): if True, the features of each listened track are
            multiplied by its share of the user's listens before the centroids are computed.

        n_clusters (int, 'auto' or anything else, default = 'Default'):
            - 'auto': number of clusters maximising the silhouette score of the (possibly weighted) features;
            - 1: a single centroid, the mean of the (possibly weighted) features;
            - int > 1: exactly that number of clusters;
            - anything else: the number of clusters maximising the silhouette score of the
              UNWEIGHTED features, then a KMeans fit using the listening counts as sample weights.

        random_state (int, default = None): pass an int for reproducible output across multiple function calls.

    Return :
        recommended_tracks (list): ids of the recommended tracks (none of them is in the history).

    Raises:
        ValueError: if the listening history is empty.
    """
    if len(listenings_history) == 0:
        raise ValueError("listenings_history is empty: no centroid can be computed.")

    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]

    is_auto = isinstance(n_clusters, str) and n_clusters == 'auto'
    is_single = not isinstance(n_clusters, str) and n_clusters == 1
    is_explicit = (isinstance(n_clusters, (int, np.integer))
                   and not isinstance(n_clusters, bool)
                   and not is_single)

    # The silhouette search on the unweighted features is only needed for the fallback mode,
    # so it is skipped otherwise; random_state is forwarded to keep seeded calls reproducible.
    if not (is_auto or is_single or is_explicit):
        _, _, _, n_clust_opt = get_listenings_history_partition(listenings_history_feats, random_state = random_state)

    # If a weighting of each tracks is desired :
    if weighted:
        # Weight of each track according to its number of listens
        w = listenings_history.listening_count / listenings_history.listening_count.sum()
        # Row-wise (positional) weighting of the features
        # NOTE(review): this rescales the features themselves, so the centroids are no longer
        # on the same scale as X — confirm that this is the intended weighting.
        listenings_history_feats = listenings_history_feats.mul(w.to_numpy(), axis = 0)

    # Compute the user tracks centroids
    if is_auto:
        labels, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, random_state = random_state)
    elif is_single:
        labels = np.zeros(len(listenings_history_feats), dtype = int)
        # distance_matrix needs a 2-D array: one row per centroid
        centroids = listenings_history_feats.mean(axis = 0).to_numpy().reshape(1, -1)
    else:
        k = int(n_clusters) if is_explicit else n_clust_opt
        clf = KMeans(n_clusters = k, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight=listenings_history.listening_count)
        centroids = clf.cluster_centers_
        labels = clf.labels_

    # Number of neighbors to find per centroid; entry i is the quota of centroid i
    # (bincount is ordered by cluster label, unlike value_counts which is ordered by frequency)
    cluster_sizes = np.bincount(np.asarray(labels, dtype = int), minlength = len(centroids))
    nb_tracks_by_clusters = _allocate_neighbors(cluster_sizes, n_neighbors)

    # Compute the distance between the unlistened tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the n_neighbors unique recommended tracks, closest first for each centroid
    recommended_tracks = []
    already_recommended = set()
    for i, n in enumerate(nb_tracks_by_clusters):
        closest = [t for t in D.iloc[:, i].sort_values(kind = 'stable').index
                   if t not in already_recommended][:n]
        recommended_tracks.extend(closest)
        already_recommended.update(closest)

    return recommended_tracks

In [6]:
def kmeans_based_ranking(listenings_history, X,  weighted = False, n_clusters="Default",  random_state = None):
    """
    Define the centroid(s) representing the average audio features of the tracks listened by a user.
    Estimate the affinity of the user for the tracks that he has not listened to, based on their
    distance to this (these) centroid(s).
    Return the rank of the unlistened tracks based on this affinity.

    Parameters:
        listenings_history (pandas.Dataframe): listening history of ONE user, whose columns are
            - user : a unique id of the user

            - track_id : a unique id for a track

            - listening_count: the number of times the user has listened to the track

            ... : other columns corresponding to track's features and/or the user's features and/or element of context of the interaction user/track.

            Only `track_id` and `listening_count` are read here.

        X (pandas.Dataframe): a dataframe corresponding to the audio_features of the tracks, indexed by track id.

        weighted (boolean, default = False): if True, the features of each listened track are
            multiplied by its share of the user's listens before the centroids are computed.

        n_clusters (int, 'auto' or anything else, default = 'Default'): define the number of centroids to form.
            - 'auto': arg max of the silhouette score of the (possibly weighted) features;
            - 1: a single centroid, the mean of the (possibly weighted) features;
            - int > 1: exactly that number of clusters;
            - anything else: arg max of the silhouette score of the UNWEIGHTED features,
              then a KMeans fit using the listening counts as sample weights.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.

    Return :
       R (pandas.DataFrame): one row per unlistened track and one column per centroid; each value is
           the rank of the track by increasing distance to that centroid (1 = closest).

    Raises:
        ValueError: if the listening history is empty.
    """
    if len(listenings_history) == 0:
        raise ValueError("listenings_history is empty: no centroid can be computed.")

    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]

    is_auto = isinstance(n_clusters, str) and n_clusters == 'auto'
    is_single = not isinstance(n_clusters, str) and n_clusters == 1
    is_explicit = (isinstance(n_clusters, (int, np.integer))
                   and not isinstance(n_clusters, bool)
                   and not is_single)

    # The silhouette search on the unweighted features is only needed for the fallback mode,
    # so it is skipped otherwise; random_state is forwarded to keep seeded calls reproducible.
    if not (is_auto or is_single or is_explicit):
        _, _, _, n_clust_opt = get_listenings_history_partition(listenings_history_feats, random_state = random_state)

    # If a weighting of each tracks is desired :
    if weighted:
        # Weight of each track according to its number of listens
        w = listenings_history.listening_count / listenings_history.listening_count.sum()
        # Row-wise (positional) weighting of the features
        # NOTE(review): this rescales the features themselves, so the centroids are no longer
        # on the same scale as X — confirm that this is the intended weighting.
        listenings_history_feats = listenings_history_feats.mul(w.to_numpy(), axis = 0)

    # Compute the user tracks centroids
    if is_auto:
        _, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, random_state = random_state)
    elif is_single:
        # distance_matrix needs a 2-D array: one row per centroid
        centroids = listenings_history_feats.mean(axis = 0).to_numpy().reshape(1, -1)
    else:
        k = int(n_clusters) if is_explicit else n_clust_opt
        clf = KMeans(n_clusters = k, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight=listenings_history.listening_count)
        centroids = clf.cluster_centers_

    # Compute the distance between the unlistened tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the ranks of the tracks relating to its distance with each centroid
    R = D.rank(axis = 0)

    return R

In [7]:
# Load the listening data from the `data` folder located next to the current working directory.
# The first column of the csv file is used as the index.
CUR_DIR = os.path.abspath('')
DATA_DIR = Path(CUR_DIR).parent.joinpath("data")
FILE_PATH = DATA_DIR.joinpath('triplets_metadata_spotify.csv')
df = pd.read_csv(filepath_or_buffer = FILE_PATH, index_col = 0)

In [8]:
# Identify a track by "<song name> - <artist name>"
df['track_id'] = df['song_name'] + ' - ' + df['artist_name']

In [9]:
# Keep a single row per (user, track) pair: the first occurrence wins.
# De-duplicating on song/artist alone is deliberately NOT applied to the full frame.
df = df.drop_duplicates(['user', 'track_id'])
len(df)

4973744

### Users filtering

In [10]:
users_summary = summarise_listening_history(df)

In [11]:
users_summary

Unnamed: 0,user,listening_count,track_count
0,0000175652312d12576d9e6b84f600caa24c4715,2,2
1,00001cf0dce3fb22b0df0f3a1d9cd21e38385372,5,4
2,0000267bde1b3a70ea75cf2b2d216cb828e3202b,5,4
3,00003a4459f33b92906be11abe0e93efc423c0ff,4,2
4,00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e,3,3
...,...,...,...
849204,ffffdc6c89988cd6119067769162948eacf8b670,9,2
849205,ffffe07df4bb5fd929efe42c5728f3a0c1621277,3,3
849206,fffff3e690fcda840b716ce7249d8935ff3323fc,4,1
849207,fffff67d54a40927c93d03bd6c816b034b59f087,15,10


In [12]:
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 3000, ntracks_min = 10)

In [13]:
# Restrict the listening data and the per-user summary to the users kept by the filter
wdf = df[df['user'].isin(ids)]
wdf_users_summary = users_summary[users_summary['user'].isin(ids)]

In [14]:
wdf_users_summary.head()

Unnamed: 0,user,listening_count,track_count
7,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10
11,0000bb531aaa657c932988bc2f7fd7fc1b2050ec,14,10
15,000138e252eea35fd73aaf66a9b34102b695a9c8,26,13
28,00028f3cff4872bff3e9985cfa32e01a8d54e374,94,21
34,0002e94348b2543c6e6ccf408b0160d14064e46f,106,38


In [15]:
wdf_users_summary.loc[wdf_users_summary.user == '22e08d5e101ab5b86dc394856d508e175a5242a6']

Unnamed: 0,user,listening_count,track_count
114964,22e08d5e101ab5b86dc394856d508e175a5242a6,1603,29


In [16]:
# Build the table of quantitative track features:
# drop the user-level columns, then the remaining metadata columns ('tempo' is kept),
# and finally remove the rows that are exact duplicates.
tracks_feats = df.drop(columns = ['user', 'listening_count'])
tracks_feats['track_id'] = df['song_name'] + ' - ' + df['artist_name']
tracks_quanti_feats = (
    tracks_feats
    .drop(columns = ['song_name', 'release', 'artist_name',
                     'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
                     'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature',
                     'release_date', 'duration'])
    .drop_duplicates()
)

In [17]:
# Standardise the quantitative features, with the tracks ids as index
X = tracks_quanti_feats.set_index('track_id')

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), index = X.index, columns = X.columns)

In [18]:
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Stronger - Kanye West,0.667073,0.244482,0.234685,0.971940,-0.825165,-0.623569,1.661334,0.055057,-0.585837
Stacked Actors - Foo Fighters,-0.048441,1.150060,0.862181,-0.294877,-0.842482,-0.620462,0.105924,0.728483,0.441564
Clarity - John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.526080,-0.197701,0.039664,-0.893749
Gimme Stitches - Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438,-0.324244
Breakout - Foo Fighters,-0.501227,1.165741,1.072171,-0.272758,-0.848599,-0.614704,-0.480018,-0.206617,1.169095
...,...,...,...,...,...,...,...,...,...
Dime - Jerry Rivera,0.991291,-0.429800,-0.219056,-0.392401,1.575591,-0.623569,0.015369,1.247983,-1.122880
The Long Conversation - Pierre de Reeder,-0.395018,-1.370660,-0.236161,-0.523105,1.705467,-0.520901,0.825034,-1.007032,-0.805067
That's the Way That It Was - Pierre de Reeder,0.583224,0.228801,0.703957,-0.425580,-0.754445,-0.623329,-0.543939,-0.679940,0.473561
I'll Close My Eyes - Mike Jones,-0.283219,-2.126092,-3.399071,-0.445688,2.187863,2.115258,3.328605,-0.899284,-1.731493


### Split the data into hidden and apparent sets

In [19]:
# Triplets ordered by decreasing number of listens
wdf_sorted = wdf.sort_values(by = 'listening_count', ascending = False)

# Users summary ordered by user id, with the average number of listens per track
wdf_users_summary_sorted = (
    wdf_users_summary
    .sort_values(by = 'user')
    .assign(listening_track_ratio = lambda s: s['listening_count'] / s['track_count'])
)

In [20]:
# Users with the highest listens-per-track ratio first (ties broken by the number of tracks)
wdf_users_summary_sorted = wdf_users_summary_sorted.sort_values(
    by = ['listening_track_ratio', 'track_count'],
    ascending = [False, False],
)
wdf_users_summary_sorted.head(50)

Unnamed: 0,user,listening_count,track_count,listening_track_ratio
375966,716ed1ec67d67bfa05db3ffeb641d13f46dca6ec,1143,10,114.3
352324,6a46aee45cc177cf8e2025e59d21c7939902deee,1379,16,86.1875
580888,af3ee32357049dd96231238bd1b019e8142ee6aa,1553,19,81.736842
224432,43e241eaa4d079ab2bdbd30231695f5563583af4,727,11,66.090909
210372,3fa44653315697f42410a30cb766a4eb102080bb,1506,25,60.24
114964,22e08d5e101ab5b86dc394856d508e175a5242a6,1603,29,55.275862
690540,d035c4a2b179ef8c756d73f9f3018ec7a87d8594,548,10,54.8
514797,9b3b5c53b64cbe8f36a5ec11aad76c09cbf3e57d,689,13,53.0
781689,eba8c4d803d654fef4cda5d8f4f70f5acea918a1,757,15,50.466667
96195,1d36c586adf56c1be114fb173bf23ececa79e9c7,750,15,50.0


In [21]:
# Try 2: triplets not sorted by users, with users_summary provided.
# Split the listening history in two parts (test1, test2) and time the call.
# perf_counter is monotonic, so the elapsed time cannot be distorted by a system clock adjustment.
st = time.perf_counter()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.perf_counter()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 9.299600839614868 seconds


In [22]:
# Sizes of the recommendation lists to evaluate
n_neighbors = [10, 25, 50, 100, 150, 200]

# Users with more than 9 tracks, in the order of the sorted summary
sample = wdf_users_summary_sorted.loc[wdf_users_summary_sorted['track_count'] > 9, 'user'].tolist()

In [23]:
sample

['716ed1ec67d67bfa05db3ffeb641d13f46dca6ec',
 '6a46aee45cc177cf8e2025e59d21c7939902deee',
 'af3ee32357049dd96231238bd1b019e8142ee6aa',
 '43e241eaa4d079ab2bdbd30231695f5563583af4',
 '3fa44653315697f42410a30cb766a4eb102080bb',
 '22e08d5e101ab5b86dc394856d508e175a5242a6',
 'd035c4a2b179ef8c756d73f9f3018ec7a87d8594',
 '9b3b5c53b64cbe8f36a5ec11aad76c09cbf3e57d',
 'eba8c4d803d654fef4cda5d8f4f70f5acea918a1',
 '1d36c586adf56c1be114fb173bf23ececa79e9c7',
 '8084aef08dffb1c0323bc6af17f80b3cd9e2e7f3',
 'c10a4319546cf80c191537752a79b88ede89d05d',
 '020f1d81ece6502a83e44c680aa9f23eff4330cb',
 'd28af1bcf1166c19f8c72b5340190c835baa329f',
 '7053ce47137ef3e711a0b1851a3561086d18c396',
 'b7318e63c35a23d8914f0751493d72b697d1d7f6',
 '1169ba1ed18d59612bf3b340540f5e2ffd997a5d',
 'd7916ae0f712c81bd9cbd015be4013d11a777fe0',
 '151d0a4e715ef1c07939da142f5704e7239a9911',
 '58b3b3be8fdb6f04701b1262a2793ef3e8f286fb',
 '5a828f20d611efe2d818ca48edf3b792f4e69e66',
 'e75ad789de4cc7c244ee203c2caf6e1536a14a95',
 'f2a2da6a

In [24]:
# Keep the two parts of the split for the sampled users only
df_apparent_sample = test1[test1['user'].isin(sample)]
df_hidden_sample = test2[test2['user'].isin(sample)]

In [25]:
df_user_apparent = df_apparent_sample.loc[df_apparent_sample.user == '22e08d5e101ab5b86dc394856d508e175a5242a6']
#df_user_apparent = df_apparent_sample.loc[df_apparent_sample.user == '3325fe1d8da7b13dd42004ede8011ce3d7cd205d']



In [26]:
df_user_hidden = df_hidden_sample.loc[df_hidden_sample.user == '22e08d5e101ab5b86dc394856d508e175a5242a6']


In [27]:
# Rank the tracks that the user has not listened to, against each centroid of the
# apparent part of his listening history.
user_rank = kmeans_based_ranking(
        listenings_history = df_apparent_sample.loc[df_apparent_sample.user == "22e08d5e101ab5b86dc394856d508e175a5242a6"],
        X = X,
        random_state = 1234)

# Turn the track_id index into a regular column
user_rank = user_rank.reset_index()

In [28]:
user_rank[user_rank['track_id'].isin(df_user_hidden['track_id'])]

Unnamed: 0,track_id,0,1,2,3,4,5,6,7,8,9,10
6663,Emma - Hot Chocolate,11398.0,8066.0,11478.0,13347.0,11266.0,5095.0,18567.0,8329.0,13463.0,115.0,18771.0
9713,Hits - Dwarves,15136.0,21137.0,17085.0,10249.0,11908.0,22434.0,1533.0,22935.0,11476.0,25946.0,1975.0
10725,I Wish You Well - Anna Waronker,7444.0,3860.0,5611.0,4673.0,8836.0,2238.0,13686.0,2110.0,5546.0,4482.0,10071.0
16369,Nothing Personal - Anna Waronker,11866.0,2600.0,4333.0,6505.0,5559.0,4960.0,4682.0,1781.0,6671.0,6431.0,8086.0
24343,Throw That World Away - Dwarves,22425.0,15363.0,2739.0,5199.0,23353.0,19282.0,365.0,25808.0,10678.0,25939.0,6.0
27216,You Gotta Burn - Dwarves,8642.0,2676.0,3733.0,1875.0,3804.0,5001.0,8591.0,3186.0,435.0,10175.0,4211.0


In [29]:
# Add the artist of each ranked track as FIRST column, and two empty columns at the END
# (the next cells select the centroid columns by position with iloc[:, 2:-2]).
# The artist is looked up in tracks_feats rather than parsed from track_id: splitting on
# ' - ' returns the wrong piece when the song name itself contains ' - '.
# The guard makes the cell safe to run twice (insert raises if the column already exists).
if 'artist_name' not in user_rank.columns:
    artist_by_track = (
        tracks_feats[['track_id', 'artist_name']]
        .drop_duplicates(subset = 'track_id')
        .set_index('track_id')['artist_name']
    )
    user_rank.insert(0, 'artist_name', user_rank['track_id'].map(artist_by_track))
user_rank['min_centroids'] = np.nan
user_rank['artist_rank'] = np.nan
user_rank

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
0,Askeleton,$ VS. Entertainment - Askeleton,5075.0,20821.0,21737.0,20805.0,24431.0,16427.0,24866.0,15908.0,22381.0,8245.0,23533.0,,
1,Gram Parsons,$1000 Wedding - Gram Parsons,6164.0,18085.0,21875.0,21684.0,4243.0,21195.0,22513.0,5214.0,16350.0,12415.0,19922.0,,
2,The Aluminum Group,$35 - The Aluminum Group,749.0,24819.0,25034.0,22619.0,20682.0,24721.0,25659.0,16713.0,22211.0,22276.0,21139.0,,
3,Shitmat,'94 Abyss - Shitmat,18202.0,18675.0,17189.0,10295.0,17050.0,14858.0,9404.0,14660.0,13763.0,19608.0,12855.0,,
4,Latterman,'Dozer Rage - Latterman,15809.0,5562.0,3718.0,3739.0,4665.0,5531.0,8058.0,11946.0,2788.0,11356.0,6255.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27579,Santana,É Papa Ré - Santana,2953.0,11611.0,9427.0,3036.0,15832.0,11984.0,10583.0,2998.0,7046.0,16623.0,3992.0,,
27580,James Dean Bradfield,Émigré - James Dean Bradfield,6165.0,5145.0,7828.0,3838.0,6915.0,7458.0,12131.0,5436.0,1944.0,13105.0,5496.0,,
27581,Ska-P,Ñapaes - Ska-P,16619.0,935.0,4738.0,6877.0,5498.0,7288.0,7546.0,4592.0,3835.0,5554.0,7937.0,,
27582,Ayo,Översättning - Ayo,21933.0,5895.0,11224.0,15459.0,9122.0,5459.0,18068.0,11172.0,12202.0,2252.0,18833.0,,


In [None]:
## safe

In [30]:
# Share of the tracks of the apparent history owed to each artist, largest share first
user_artists = (
    df_user_apparent.groupby('artist_name')['track_id'].count()
    .div(len(df_user_apparent))
    .sort_values(ascending = False)
)

for artist, share in user_artists.items():
    print(artist, ' ', artist, ' ', share)


Dwarves   Dwarves   0.2608695652173913
The Wonders   The Wonders   0.21739130434782608
Cocteau Twins   Cocteau Twins   0.13043478260869565
Anna Waronker   Anna Waronker   0.043478260869565216
Foo Fighters   Foo Fighters   0.043478260869565216
MGMT   MGMT   0.043478260869565216
Meat Loaf   Meat Loaf   0.043478260869565216
Percy Sledge   Percy Sledge   0.043478260869565216
Pixies   Pixies   0.043478260869565216
Sloppy Seconds   Sloppy Seconds   0.043478260869565216
The Exciters   The Exciters   0.043478260869565216
The Teddy Bears   The Teddy Bears   0.043478260869565216


In [31]:
# Rank first the unlistened tracks of the artists already present in the apparent history:
# artists are taken by decreasing share (order of user_artists) and, within an artist,
# tracks by increasing best rank over the centroids (columns selected by position: 2:-2).
# Done in one pass instead of concatenating onto an empty frame inside a loop.
artist_position = {artist: pos for pos, artist in enumerate(user_artists.index)}

user_rank_artist = user_rank[user_rank['artist_name'].isin(user_artists.index)].copy()
user_rank_artist['min_centroids'] = user_rank_artist.iloc[:, 2:-2].min(axis = 1)

# lexsort is stable and uses its LAST key as the primary one
row_order = np.lexsort((
    user_rank_artist['min_centroids'].to_numpy(),
    user_rank_artist['artist_name'].map(artist_position).to_numpy(),
))
user_rank_artist = user_rank_artist.iloc[row_order].assign(
    artist_rank = np.arange(1, len(user_rank_artist) + 1, dtype = float)
)

user_rank_artist

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
24343,Dwarves,Throw That World Away - Dwarves,22425.0,15363.0,2739.0,5199.0,23353.0,19282.0,365.0,25808.0,10678.0,25939.0,6.0,6.0,1.0
27216,Dwarves,You Gotta Burn - Dwarves,8642.0,2676.0,3733.0,1875.0,3804.0,5001.0,8591.0,3186.0,435.0,10175.0,4211.0,435.0,2.0
9713,Dwarves,Hits - Dwarves,15136.0,21137.0,17085.0,10249.0,11908.0,22434.0,1533.0,22935.0,11476.0,25946.0,1975.0,1533.0,3.0
12162,Anna Waronker,John & Maria - Anna Waronker,7552.0,4432.0,9804.0,9632.0,3965.0,9391.0,13378.0,103.0,6855.0,5853.0,10680.0,103.0,4.0
8714,Anna Waronker,Goodbye - Anna Waronker,7464.0,9670.0,15902.0,11101.0,5948.0,14698.0,18355.0,158.0,6365.0,12657.0,10897.0,158.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13678,Sloppy Seconds,Lonely Christmas - Sloppy Seconds,5120.0,23012.0,17874.0,4869.0,19634.0,18105.0,12884.0,21485.0,13697.0,24344.0,7466.0,4869.0,103.0
6475,Sloppy Seconds,Ejaculation - Sloppy Seconds,24990.0,24927.0,25535.0,25395.0,5000.0,25798.0,22349.0,10049.0,22854.0,24164.0,23672.0,5000.0,104.0
27488,Sloppy Seconds,Yuppies - Sloppy Seconds,11519.0,20177.0,14477.0,5408.0,17755.0,13234.0,7319.0,20217.0,12965.0,21562.0,9128.0,5408.0,105.0
23408,Sloppy Seconds,The Men - Sloppy Seconds,23137.0,16179.0,15746.0,17087.0,8989.0,10883.0,14255.0,9167.0,17307.0,6887.0,20827.0,6887.0,106.0


In [32]:
# Drop the tracks already ranked by artist frequency (artists identified in df_user_apparent).
# errors='ignore' keeps the cell re-runnable: the labels are already gone on a second run.
user_rank = user_rank.drop(index = user_rank_artist.index, errors = 'ignore')

# Sort the remaining tracks by their best (minimum) rank over the centroids
user_rank['min_centroids'] = user_rank.iloc[:,2:-2].min(axis=1)
user_rank = user_rank.sort_values(by='min_centroids')
user_rank

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
22902,Josh Ritter,The Curse - Josh Ritter,1.0,23782.0,21868.0,13031.0,23696.0,21299.0,22335.0,22518.0,18943.0,22531.0,13376.0,1.0,
23850,Skid Row,The Threat - Skid Row,18493.0,1.0,410.0,4479.0,6692.0,4790.0,2309.0,7195.0,2428.0,10041.0,2286.0,1.0,
2320,Jordan Rudess,Beyond Tomorrow - Jordan Rudess,13627.0,13845.0,17728.0,15189.0,1.0,18315.0,11936.0,7303.0,7111.0,16105.0,12864.0,1.0,
8217,Butthole Surfers,Gary Floyd [live] - Butthole Surfers,27039.0,22367.0,13812.0,22723.0,23296.0,24361.0,1.0,26918.0,23605.0,26580.0,4570.0,1.0,
3921,Sade,Cherry Pie - Sade,15240.0,8935.0,11965.0,15924.0,13626.0,5519.0,18815.0,5401.0,15546.0,1.0,19949.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13655,Brian Regan,Log Trucks - Brian Regan,27568.0,27569.0,27570.0,27575.0,27545.0,27576.0,27544.0,27532.0,27571.0,27563.0,27555.0,27532.0,
11670,Paul Horn,Introduction - Paul Horn,27558.0,27583.0,27583.0,27582.0,27583.0,27583.0,27584.0,27538.0,27583.0,27584.0,27582.0,27538.0,
14949,Jackie Martling,Mismatches & Memories - Jackie Martling,27579.0,27573.0,27566.0,27577.0,27574.0,27574.0,27546.0,27574.0,27577.0,27560.0,27564.0,27546.0,
11979,Will Smith,Jaden's Interlude - Will Smith,27583.0,27581.0,27579.0,27581.0,27582.0,27582.0,27560.0,27583.0,27581.0,27583.0,27576.0,27560.0,


In [33]:
# Continue the numbering after the tracks already ranked in user_rank_artist:
# the remaining tracks are ranked by min_centroids (ties in order of appearance).
user_rank['artist_rank'] = (
    user_rank['min_centroids']
    .rank(method = 'first')
    .add(len(user_rank_artist))
)
user_rank

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
22902,Josh Ritter,The Curse - Josh Ritter,1.0,23782.0,21868.0,13031.0,23696.0,21299.0,22335.0,22518.0,18943.0,22531.0,13376.0,1.0,108.0
23850,Skid Row,The Threat - Skid Row,18493.0,1.0,410.0,4479.0,6692.0,4790.0,2309.0,7195.0,2428.0,10041.0,2286.0,1.0,109.0
2320,Jordan Rudess,Beyond Tomorrow - Jordan Rudess,13627.0,13845.0,17728.0,15189.0,1.0,18315.0,11936.0,7303.0,7111.0,16105.0,12864.0,1.0,110.0
8217,Butthole Surfers,Gary Floyd [live] - Butthole Surfers,27039.0,22367.0,13812.0,22723.0,23296.0,24361.0,1.0,26918.0,23605.0,26580.0,4570.0,1.0,111.0
3921,Sade,Cherry Pie - Sade,15240.0,8935.0,11965.0,15924.0,13626.0,5519.0,18815.0,5401.0,15546.0,1.0,19949.0,1.0,112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13655,Brian Regan,Log Trucks - Brian Regan,27568.0,27569.0,27570.0,27575.0,27545.0,27576.0,27544.0,27532.0,27571.0,27563.0,27555.0,27532.0,27580.0
11670,Paul Horn,Introduction - Paul Horn,27558.0,27583.0,27583.0,27582.0,27583.0,27583.0,27584.0,27538.0,27583.0,27584.0,27582.0,27538.0,27581.0
14949,Jackie Martling,Mismatches & Memories - Jackie Martling,27579.0,27573.0,27566.0,27577.0,27574.0,27574.0,27546.0,27574.0,27577.0,27560.0,27564.0,27546.0,27582.0
11979,Will Smith,Jaden's Interlude - Will Smith,27583.0,27581.0,27579.0,27581.0,27582.0,27582.0,27560.0,27583.0,27581.0,27583.0,27576.0,27560.0,27583.0


In [34]:
# Stack the artist-based ranking on top of the ranking of the remaining tracks
full_ranks = pd.concat((user_rank_artist, user_rank), axis = 0)
full_ranks

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
24343,Dwarves,Throw That World Away - Dwarves,22425.0,15363.0,2739.0,5199.0,23353.0,19282.0,365.0,25808.0,10678.0,25939.0,6.0,6.0,1.0
27216,Dwarves,You Gotta Burn - Dwarves,8642.0,2676.0,3733.0,1875.0,3804.0,5001.0,8591.0,3186.0,435.0,10175.0,4211.0,435.0,2.0
9713,Dwarves,Hits - Dwarves,15136.0,21137.0,17085.0,10249.0,11908.0,22434.0,1533.0,22935.0,11476.0,25946.0,1975.0,1533.0,3.0
12162,Anna Waronker,John & Maria - Anna Waronker,7552.0,4432.0,9804.0,9632.0,3965.0,9391.0,13378.0,103.0,6855.0,5853.0,10680.0,103.0,4.0
8714,Anna Waronker,Goodbye - Anna Waronker,7464.0,9670.0,15902.0,11101.0,5948.0,14698.0,18355.0,158.0,6365.0,12657.0,10897.0,158.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13655,Brian Regan,Log Trucks - Brian Regan,27568.0,27569.0,27570.0,27575.0,27545.0,27576.0,27544.0,27532.0,27571.0,27563.0,27555.0,27532.0,27580.0
11670,Paul Horn,Introduction - Paul Horn,27558.0,27583.0,27583.0,27582.0,27583.0,27583.0,27584.0,27538.0,27583.0,27584.0,27582.0,27538.0,27581.0
14949,Jackie Martling,Mismatches & Memories - Jackie Martling,27579.0,27573.0,27566.0,27577.0,27574.0,27574.0,27546.0,27574.0,27577.0,27560.0,27564.0,27546.0,27582.0
11979,Will Smith,Jaden's Interlude - Will Smith,27583.0,27581.0,27579.0,27581.0,27582.0,27582.0,27560.0,27583.0,27581.0,27583.0,27576.0,27560.0,27583.0


In [35]:
full_ranks[full_ranks['track_id'].isin(df_user_hidden['track_id'])]

Unnamed: 0,artist_name,track_id,0,1,2,3,4,5,6,7,8,9,10,min_centroids,artist_rank
24343,Dwarves,Throw That World Away - Dwarves,22425.0,15363.0,2739.0,5199.0,23353.0,19282.0,365.0,25808.0,10678.0,25939.0,6.0,6.0,1.0
27216,Dwarves,You Gotta Burn - Dwarves,8642.0,2676.0,3733.0,1875.0,3804.0,5001.0,8591.0,3186.0,435.0,10175.0,4211.0,435.0,2.0
9713,Dwarves,Hits - Dwarves,15136.0,21137.0,17085.0,10249.0,11908.0,22434.0,1533.0,22935.0,11476.0,25946.0,1975.0,1533.0,3.0
16369,Anna Waronker,Nothing Personal - Anna Waronker,11866.0,2600.0,4333.0,6505.0,5559.0,4960.0,4682.0,1781.0,6671.0,6431.0,8086.0,1781.0,12.0
10725,Anna Waronker,I Wish You Well - Anna Waronker,7444.0,3860.0,5611.0,4673.0,8836.0,2238.0,13686.0,2110.0,5546.0,4482.0,10071.0,2110.0,13.0
6663,Hot Chocolate,Emma - Hot Chocolate,11398.0,8066.0,11478.0,13347.0,11266.0,5095.0,18567.0,8329.0,13463.0,115.0,18771.0,115.0,1356.0


In [36]:
df_user_apparent[df_user_apparent['artist_name'] == 'The Wonders']

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
4009100,22e08d5e101ab5b86dc394856d508e175a5242a6,212,I Need You (That Thing You Do),That Thing You Do! Original Motion Picture Sou...,The Wonders,172.43383,0.539927,0.389694,1996,-1,...,1,0.0278,0.00174,0.0,0.154,0.654,141.872,4.0,1996-09-24,I Need You (That Thing You Do) - The Wonders
3672619,22e08d5e101ab5b86dc394856d508e175a5242a6,496,Dance With Me Tonight,That Thing You Do! Original Motion Picture Sou...,The Wonders,125.23057,0.539927,0.389694,1996,-1,...,0,0.0292,0.0996,0.0,0.283,0.968,142.061,4.0,1996-09-24,Dance With Me Tonight - The Wonders
2768037,22e08d5e101ab5b86dc394856d508e175a5242a6,130,That Thing You Do!,That Thing You Do! Original Motion Picture Sou...,The Wonders,166.76526,0.539927,0.389694,1996,-1,...,1,0.0432,0.0446,6e-06,0.094,0.962,134.52,4.0,1996-09-24,That Thing You Do! - The Wonders
3598454,22e08d5e101ab5b86dc394856d508e175a5242a6,55,All My Only Dreams,That Thing You Do! Original Motion Picture Sou...,The Wonders,174.18404,0.539927,0.389694,1996,-1,...,1,0.0253,0.0512,0.0,0.223,0.542,112.976,4.0,1996-09-24,All My Only Dreams - The Wonders
3827667,22e08d5e101ab5b86dc394856d508e175a5242a6,149,Little Wild One,That Thing You Do! Original Motion Picture Sou...,The Wonders,150.46485,0.539927,0.389694,1996,-1,...,1,0.0323,0.014,0.0,0.438,0.802,131.499,4.0,1996-09-24,Little Wild One - The Wonders


In [37]:
tracks_feats[tracks_feats['artist_name'] == 'The Wonders'].drop_duplicates(subset=['song_name', 'artist_name'])

Unnamed: 0,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,shs_work,explicit,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
2766875,That Thing You Do!,That Thing You Do! Original Motion Picture Sou...,The Wonders,166.76526,0.539927,0.389694,1996,-1,0,False,...,1,0.0432,0.0446,6e-06,0.094,0.962,134.52,4.0,1996-09-24,That Thing You Do! - The Wonders
3598331,All My Only Dreams,That Thing You Do! Original Motion Picture Sou...,The Wonders,174.18404,0.539927,0.389694,1996,-1,0,False,...,1,0.0253,0.0512,0.0,0.223,0.542,112.976,4.0,1996-09-24,All My Only Dreams - The Wonders
3672433,Dance With Me Tonight,That Thing You Do! Original Motion Picture Sou...,The Wonders,125.23057,0.539927,0.389694,1996,-1,0,False,...,0,0.0292,0.0996,0.0,0.283,0.968,142.061,4.0,1996-09-24,Dance With Me Tonight - The Wonders
3827528,Little Wild One,That Thing You Do! Original Motion Picture Sou...,The Wonders,150.46485,0.539927,0.389694,1996,-1,0,False,...,1,0.0323,0.014,0.0,0.438,0.802,131.499,4.0,1996-09-24,Little Wild One - The Wonders
4009030,I Need You (That Thing You Do),That Thing You Do! Original Motion Picture Sou...,The Wonders,172.43383,0.539927,0.389694,1996,-1,0,False,...,1,0.0278,0.00174,0.0,0.154,0.654,141.872,4.0,1996-09-24,I Need You (That Thing You Do) - The Wonders
