In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import time

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

In [3]:
from ipynb.fs.full.user_playlist_utils import user_filter, summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history

In [4]:
def get_listenings_history_partition(listenings_history_feats, listenings_history, max_clusters = 15, random_state = None):
    """
    Partition the tracks listened by a user with KMeans and select the number of clusters
    that maximises the silhouette score.

    One KMeans model is fitted per candidate number of clusters, using the listening counts
    as sample weights. The candidates go from 2 up to, but excluding,
    min(max_clusters, n_tracks - 1).

    Parameters:
        listenings_history_feats (pandas.DataFrame): features of the tracks listened by the user,
            one row per row of `listenings_history`.

        listenings_history (pandas.DataFrame): listening history of the user; its
            `listening_count` column is used as KMeans sample weights.

        max_clusters (int, default = 15): exclusive upper bound of the candidate numbers of clusters.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.

    Return :
        labels : cluster label of each track for the best partition.
        centroids : cluster centers of the best partition.
        score : silhouette score of the best partition.
        n_clusters : number of clusters of the best partition.

    Raises:
        ValueError: if there is no candidate number of clusters to evaluate
            (too few tracks or max_clusters <= 2).
    """
    n_tracks = len(listenings_history_feats)
    # NOTE(review): the upper bound is exclusive, so `max_clusters` itself is never tried — confirm this is intended.
    n_clusters_range = np.arange(2, min(max_clusters, n_tracks - 1))

    # Fail early with an explicit message instead of an opaque "max() arg is an empty sequence"
    if len(n_clusters_range) == 0:
        raise ValueError(
            "Cannot search for an optimal number of clusters: no candidate value in "
            f"[2, min(max_clusters, n_tracks - 1)) with n_tracks={n_tracks} and max_clusters={max_clusters}."
        )

    s_scores = []
    labels = []
    centroids = []

    # Search the optimal number of clusters
    for n_clusters in n_clusters_range:
        clf = KMeans(n_clusters = n_clusters, n_init = "auto", random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listenings_history.listening_count)
        centroids.append(clf.cluster_centers_)
        labels.append(clf.labels_)
        s_scores.append(silhouette_score(listenings_history_feats, clf.labels_, metric='euclidean'))

    # Define the optimal number of clusters from the silhouette score (first maximum wins on ties)
    i_clusters_opt = s_scores.index(max(s_scores))
    n_cluster_opt = int(n_clusters_range[i_clusters_opt])

    # Return the corresponding partition
    return labels[i_clusters_opt], centroids[i_clusters_opt], s_scores[i_clusters_opt], n_cluster_opt

In [5]:

def _allocate_neighbors(cluster_sizes, n_neighbors):
    """Split `n_neighbors` between clusters proportionally to their sizes (largest-remainder method)."""
    sizes = np.asarray(cluster_sizes, dtype=float)
    quotas = n_neighbors * sizes / sizes.sum()
    counts = np.floor(quotas).astype(int)
    leftover = int(n_neighbors - counts.sum())
    if leftover > 0:
        # Give the remaining slots to the largest fractional parts; the stable sort breaks ties by cluster label
        order = np.argsort(-(quotas - counts), kind="stable")
        counts[order[:leftover]] += 1
    return counts.tolist()


def kmeans_based_knn(listenings_history, X, n_neighbors, weighted = False, n_clusters ='Default',  random_state = None):
    """
    Recommend the `n_neighbors` unlistened tracks closest to the centroid(s) of the tracks listened by a user.

    The tracks of the listening history are summarised by one or several centroids. Each centroid
    receives a share of `n_neighbors` proportional to the size of its cluster, and contributes its
    closest not-yet-recommended tracks among the tracks of `X` that are not in the listening history.

    Parameters:
        listenings_history (pandas.Dataframe): listening history of the user, with at least the columns
            - track_id : a unique id for a track (must be present in the index of X)
            - listening_count : the number of times the user has listened to the track

        X (pandas.Dataframe): features of the tracks, indexed by track_id.

        n_neighbors (int): number of tracks to recommend.

        weighted (boolean, default = False): if True, the features of each listened track are multiplied
            by its share of the total listening count before computing the centroids.

        n_clusters (int, 'auto' or 'Default'): number of centroids to form.
            - 1 : a single centroid, the mean of the (possibly weighted) listened tracks features.
            - int > 1 : KMeans with this number of clusters.
            - 'auto' : partition maximising the silhouette score on the (possibly weighted) features.
            - 'Default' (or any other non-integer value) : KMeans with the number of clusters
              maximising the silhouette score on the unweighted features.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.

    Return :
        recommended_tracks : list of the recommended track ids, grouped by centroid.
    """
    # Get the features of the songs listened by the user
    raw_feats = X.loc[listenings_history.track_id]
    listenings_history_feats = raw_feats

    # If a weighting of each tracks is desired :
    if weighted:
        # Weight of each track according to its number of listens (positional, one weight per history row)
        w = (listenings_history.listening_count / listenings_history.listening_count.sum()).to_numpy()
        # NOTE(review): scaling the features by w moves the centroids out of the feature space of X
        # (weights sum to 1); behaviour kept as-is — confirm whether a weighted mean was intended.
        listenings_history_feats = raw_feats.mul(w, axis = 0)

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        labels, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)
    elif n_clusters == 1:
        labels = np.zeros(len(listenings_history), dtype = int)
        # distance_matrix needs a 2-D array: keep the single centroid as a (1, n_features) matrix
        centroids = listenings_history_feats.mean().to_numpy().reshape(1, -1)
    else:
        if isinstance(n_clusters, (int, np.integer)):
            # Honour the number of clusters requested by the caller
            n_clust = int(n_clusters)
        else:
            # The silhouette search is only run when its result is actually needed
            _, _, _, n_clust = get_listenings_history_partition(raw_feats, listenings_history, random_state = random_state)
        clf = KMeans(n_clusters = n_clust, n_init = "auto", random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listenings_history.listening_count)
        centroids = clf.cluster_centers_
        labels = clf.labels_

    # Define the number of neighbors to find according to the clusters size.
    # Sizes are indexed by cluster label so that quota i is applied to centroid i.
    cluster_sizes = np.bincount(np.asarray(labels, dtype = int), minlength = len(centroids))
    nb_tracks_by_clusters = _allocate_neighbors(cluster_sizes, n_neighbors)

    # Compute the distance between the unlistened tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the n_neighbors unique recommended tracks, closest tracks first for each centroid
    recommended_tracks = []
    already_recommended = set()
    for i, n in enumerate(nb_tracks_by_clusters):
        ordered_tracks = D.iloc[:, i].sort_values(kind = "stable").index
        picked = [t for t in ordered_tracks if t not in already_recommended][:n]
        recommended_tracks.extend(picked)
        already_recommended.update(picked)

    return recommended_tracks


In [6]:

def kmeans_based_ranking(listenings_history, X,  weighted = False, n_clusters="Default",  random_state = None):
    """
    Define the centroid(s) representing the average audio features of the tracks listened by a user.
    Estimate the affinity of the user for the tracks that he has not listened to, based on their distance to this (these) centroid(s).
    Return the rank of the unlistened tracks based on this affinity.

    Parameters:
        listenings_history (pandas.Dataframe): a dataframe whose columns are
            - user : a unique id of the user

            - track_id : a unique id for a track

            - listening_count: the number of times the user has listened to the track

            ... : other columns corresponding to track's features and/or the user's features and/or element of context of the interaction user/track.

        X (pandas.Dataframe): a dataframe corresponding to the audio_features of the tracks, indexed by track_id.

        weighted (boolean, default = False): if True, the features of each listened track are multiplied
            by its share of the total listening count before computing the centroids.

        n_clusters (int, 'auto' or 'Default', default = 'Default'): define the number of centroids to form.
            - 1 : a single centroid, the mean of the (possibly weighted) listened tracks features.
            - int > 1 : KMeans with this number of clusters.
            - 'auto' : partition maximising the silhouette score on the (possibly weighted) features.
            - 'Default' : KMeans with the number of clusters maximising the silhouette score
              on the unweighted features.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.

    Return :
       R : a dataframe indexed by the unlistened track ids, with one column per centroid holding
           the rank of each track according to its distance to this centroid (1 = closest).

    Raises:
        ValueError: if `n_clusters` is neither an integer, 'auto' nor 'Default'.
    """

    # Get the features of the songs listened by the user
    raw_feats = X.loc[listenings_history.track_id]
    listenings_history_feats = raw_feats

    # If a weighting of each tracks is desired :
    if weighted:
        # Weight of each track according to its number of listens (positional, one weight per history row)
        w = (listenings_history.listening_count / listenings_history.listening_count.sum()).to_numpy()
        # NOTE(review): scaling the features by w moves the centroids out of the feature space of X
        # (weights sum to 1); behaviour kept as-is — confirm whether a weighted mean was intended.
        listenings_history_feats = raw_feats.mul(w, axis = 0)

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        _, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)
    elif n_clusters == 1:
        # distance_matrix needs a 2-D array: keep the single centroid as a (1, n_features) matrix
        centroids = listenings_history_feats.mean().to_numpy().reshape(1, -1)
    elif n_clusters == 'Default' or isinstance(n_clusters, (int, np.integer)):
        if n_clusters == 'Default':
            # The silhouette search is only run when its result is actually needed
            _, _, _, n_clust = get_listenings_history_partition(raw_feats, listenings_history, random_state = random_state)
        else:
            # Honour the number of clusters requested by the caller
            n_clust = int(n_clusters)
        clf = KMeans(n_clusters = n_clust, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listenings_history.listening_count)
        centroids = clf.cluster_centers_
    else:
        raise ValueError(f"n_clusters must be an integer, 'auto' or 'Default', got {n_clusters!r}")

    # Compute the distance between the unlistened tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the ranks of the tracks relating to its distance with each centroid
    R = D.rank(axis = 0)

    return R


In [8]:
# Load the triplets/metadata CSV from ../data/millionsong, relative to the current working directory
CUR_DIR = os.path.abspath('')
DATA_DIR = Path(CUR_DIR).parent.joinpath("data", "millionsong")
FILE_PATH = DATA_DIR.joinpath('triplets_metadata_spotify.csv')
df = pd.read_csv(FILE_PATH, index_col=0)

In [9]:
# Build a track identifier of the form "<song_name> - <artist_name>"
df['track_id'] = df['song_name'] + ' - ' + df['artist_name']

In [10]:
# Keep a single row per (user, track) pair, retaining the first occurrence.
# De-duplicating on song/artist alone does not apply to the full triplets frame but to tracks_feats.
df = df.drop_duplicates(subset=['user', 'track_id'], keep='first')

len(df)

4973744

### Users filtering

In [11]:
# Summarise the listening history per user; the result feeds the user filtering below
users_summary = summarise_listening_history(df)

In [12]:
# Ids of the users to keep.
# NOTE(review): the thresholds presumably bound the listening count (1 to 3000) and require at least 10 tracks — confirm against user_filter.
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 3000, ntracks_min = 10)

In [13]:
# Restrict the triplets and the users summary to the selected users
wdf = df[df['user'].isin(ids)]
wdf_users_summary = users_summary[users_summary['user'].isin(ids)]

In [14]:
# Get the songs quantitative features

tracks_feats = df.drop(['user', 'listening_count'],axis = 1)
tracks_feats['track_id'] = df.song_name + ' - ' + df.artist_name
# Keep one feature row per track_id: the feature matrix built from this frame is indexed by track_id,
# and a non-unique index would return several rows per track when looking up a listening history.
tracks_quanti_feats = tracks_feats.drop(['song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
       'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature', 'release_date', 'duration'], axis = 1).drop_duplicates(subset = 'track_id', keep = 'first') # , 'tempo'

In [15]:
# Standardize the quantitative features of the songs, indexed by track_id
X = tracks_quanti_feats.set_index('track_id')

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

In [16]:
# Display the standardized feature matrix
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Stronger - Kanye West,0.667073,0.244482,0.234685,0.971940,-0.825165,-0.623569,1.661334,0.055057,-0.585837
Stacked Actors - Foo Fighters,-0.048441,1.150060,0.862181,-0.294877,-0.842482,-0.620462,0.105924,0.728483,0.441564
Clarity - John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.526080,-0.197701,0.039664,-0.893749
Gimme Stitches - Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438,-0.324244
Breakout - Foo Fighters,-0.501227,1.165741,1.072171,-0.272758,-0.848599,-0.614704,-0.480018,-0.206617,1.169095
...,...,...,...,...,...,...,...,...,...
Dime - Jerry Rivera,0.991291,-0.429800,-0.219056,-0.392401,1.575591,-0.623569,0.015369,1.247983,-1.122880
The Long Conversation - Pierre de Reeder,-0.395018,-1.370660,-0.236161,-0.523105,1.705467,-0.520901,0.825034,-1.007032,-0.805067
That's the Way That It Was - Pierre de Reeder,0.583224,0.228801,0.703957,-0.425580,-0.754445,-0.623329,-0.543939,-0.679940,0.473561
I'll Close My Eyes - Mike Jones,-0.283219,-2.126092,-3.399071,-0.445688,2.187863,2.115258,3.328605,-0.899284,-1.731493


### Split the data into hidden and apparent sets

In [17]:
# Triplets ordered by decreasing listening count
wdf_sorted = wdf.sort_values(by='listening_count', ascending=False)
# Users summary ordered by user id, with the ratio of listening_count to track_count
wdf_users_summary_sorted = wdf_users_summary.sort_values(by='user').assign(
    listening_track_ratio=lambda summary: summary['listening_count'] / summary['track_count']
)

In [18]:
# Users with the largest track_count first
wdf_users_summary_sorted = wdf_users_summary_sorted.sort_values('track_count', ascending=False)
wdf_users_summary_sorted.head(50)

Unnamed: 0,user,listening_count,track_count,listening_track_ratio
784321,ec6dfcf19485cb011e0b22637075037aae34cf26,378,303,1.247525
259535,4e73d9e058d2b1f2dba9c1fe4a8f416f9f58364f,697,207,3.36715
599585,b4c94d72b15d3c311c10045a58b31f95d9d12785,535,180,2.972222
429569,8192a1d3ca0afbbcf1c98d7062d573e9207078e6,230,178,1.292135
362560,6d625c6557df84b60d90426c0116138b617b9449,936,176,5.318182
494367,952130c8ae0527458c75471812dbb5c506b46d5d,367,176,2.085227
466383,8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc,290,175,1.657143
845758,fef771ab021c200187a419f5e55311390f850a50,270,172,1.569767
675841,cbc7bddbe3b2f59fdbe031b3c8d0db4175d361e6,266,170,1.564706
379263,726da71c2c2ea119119a7957517fccd028d1be76,300,162,1.851852


In [19]:
# Try 2: Triplets non sorted by users, with users_summary
# Split the filtered triplets into two frames and time the call; test1 is used below as the
# apparent history and test2 as the hidden history.
# NOTE(review): 0.2 is presumably the proportion of hidden tracks — confirm against split_history.
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 9.61426043510437 seconds


In [20]:
# Candidate n_neighbors values
n_neighbors = [10, 25, 50, 100, 150, 200]
# Ids of the users whose track_count exceeds 100
sample = wdf_users_summary_sorted.loc[wdf_users_summary_sorted['track_count'] > 100, 'user'].tolist()

In [21]:
# User id selected for inspection (other candidates kept for reference)
#user_name = '22e08d5e101ab5b86dc394856d508e175a5242a6'
user_name = '005a475315cd3a29638cf242c4b7c71194e81642'

#user_name = sample[6]

# Apparent and hidden histories restricted to the sampled users
df_apparent_sample = test1[test1['user'].isin(sample)]
df_hidden_sample = test2[test2['user'].isin(sample)]


In [22]:
# Apparent and hidden listening histories of the selected user.
# `user_name` is used instead of repeating the id so that changing it updates both frames.
df_user_apparent = df_apparent_sample.loc[df_apparent_sample['user'] == user_name]
df_user_hidden = df_hidden_sample.loc[df_hidden_sample['user'] == user_name]



In [23]:

# Rank the tracks absent from the user's apparent history; reset_index turns the track ids into a column.
# The apparent history computed above is reused instead of filtering again on a hard-coded user id.
user_rank = kmeans_based_ranking(
        listenings_history = df_user_apparent,
        X = X,
        random_state = 123).reset_index()


In [24]:
def rank_by_artist_filter(initial_ranking, apparent_history: pd.DataFrame):
    """
    Re-rank tracks so that the tracks of the artists present in the user's apparent history come first.

    The artists of `apparent_history` are ordered by decreasing share of the history. For each of
    them, the candidate tracks of this artist are ordered by their best (minimum) rank across
    centroids and receive consecutive `artist_rank` values. The remaining tracks are then ordered
    the same way and continue the numbering.

    Parameters:
        initial_ranking (pandas.DataFrame): a `track_id` column (formatted "<song> - <artist>")
            plus one column of ranks per centroid. The frame is not modified.

        apparent_history (pandas.DataFrame): apparent listening history of the user, with at least
            the columns `artist_name` and `track_id`.

    Return :
        full_ranks : a copy of `initial_ranking` with the extra columns
            - artist_name : artist extracted from `track_id` (text after the last ' - ')
            - min_centroids : minimum rank of the track across the centroids
            - artist_rank : rank after the artist-based re-ordering
            - best_rank : minimum of `min_centroids` and `artist_rank`
          Rows are ordered by `artist_rank`.
    """
    # Work on a copy so the caller's frame is left untouched and the call can be repeated
    ranking = initial_ranking.copy()
    centroid_cols = [c for c in ranking.columns if c != 'track_id']

    # The artist is the text after the LAST ' - ', so song titles containing ' - ' are handled
    ranking.insert(0, 'artist_name', ranking['track_id'].str.rsplit(' - ', n=1).str[1])
    ranking['min_centroids'] = ranking[centroid_cols].min(axis=1)
    ranking['artist_rank'] = np.nan

    # get the artists and sort them by proportion in the apparent history
    user_artists = (apparent_history.groupby('artist_name')['track_id'].count()/len(apparent_history)).sort_values(ascending=False)

    # Rank the tracks of each known artist, in order of the artist importance in the apparent history
    ranked_blocks = []
    n_ranked = 0
    for artist in user_artists.index:
        block = ranking.loc[ranking['artist_name'] == artist].sort_values(by='min_centroids', kind='stable')
        if block.empty:
            continue
        block['artist_rank'] = block['min_centroids'].rank(method='first', ascending=True) + n_ranked
        ranked_blocks.append(block)
        n_ranked += len(block)

    # Remaining tracks: sorted by their best centroid rank and numbered after the artist-based ranks
    is_known_artist = ranking['artist_name'].isin(user_artists.index)
    remaining = ranking.loc[~is_known_artist].sort_values(by='min_centroids', kind='stable')
    remaining['artist_rank'] = remaining['min_centroids'].rank(method='first') + n_ranked

    # combine the artist based ranks and the remaining tracks in a single concatenation
    full_ranks = pd.concat(ranked_blocks + [remaining])
    full_ranks['best_rank'] = full_ranks[['min_centroids', 'artist_rank']].min(axis=1)

    return full_ranks

In [25]:
# Display the ranking of the candidate tracks
user_rank

Unnamed: 0,track_id,0,1
0,$ VS. Entertainment - Askeleton,18724.0,11593.0
1,$1000 Wedding - Gram Parsons,17022.0,13474.0
2,$35 - The Aluminum Group,19518.0,22840.0
3,'94 Abyss - Shitmat,11536.0,19642.0
4,'Dozer Rage - Latterman,7276.0,11106.0
...,...,...,...
27516,É Papa Ré - Santana,745.0,13950.0
27517,Émigré - James Dean Bradfield,502.0,9720.0
27518,Ñapaes - Ska-P,9411.0,5684.0
27519,Översättning - Ayo,14521.0,2070.0


In [26]:
# Pass a copy of the ranking so that `user_rank` is not altered by the call and this cell can be re-run
full_ranks = rank_by_artist_filter(initial_ranking = user_rank.copy(), apparent_history = df_user_apparent)

In [32]:
# First 20 rows of the re-ranked tracks
full_ranks.head(20)

Unnamed: 0,artist_name,track_id,0,1,min_centroids,artist_rank,best_rank
5041,Cake,Daria - Cake,6842.0,109.0,109.0,1.0,1.0
18251,Cake,Race Car Ya-Yas - Cake,7376.0,133.0,133.0,2.0,2.0
11910,Cake,Italian Leather Sofa - Cake,5773.0,710.0,710.0,3.0,3.0
5506,Cake,Dime - Cake,16199.0,1408.0,1408.0,4.0,4.0
4374,Cake,Comfort Eagle - Cake,13891.0,1426.0,1426.0,5.0,5.0
24672,Cake,Tougher Than It Is - Cake,17788.0,1689.0,1689.0,6.0,6.0
16888,Cake,Opera Singer - Cake,18564.0,1882.0,1882.0,7.0,7.0
25779,Cake,War Pigs - Cake,10354.0,2649.0,2649.0,8.0,8.0
16125,Cake,No Phone - Cake,3290.0,11196.0,3290.0,9.0,9.0
11703,Cake,Is This Love? - Cake,16089.0,4698.0,4698.0,10.0,10.0


In [33]:
# Hidden tracks of the user by the artist 'Cake' (the same filter on the apparent history is kept for reference)
#df_user_apparent[df_user_apparent["artist_name"]=='Cake']
df_user_hidden.loc[df_user_hidden["artist_name"] == 'Cake']

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
1959715,005a475315cd3a29638cf242c4b7c71194e81642,2,Hem Of Your Garment,Prolonging the Magic,Cake,215.90159,0.794331,0.650666,1998,-1,...,1,0.0336,0.0138,0.237,0.0668,0.955,119.085,4.0,1998-02-05,Hem Of Your Garment - Cake
236335,005a475315cd3a29638cf242c4b7c71194e81642,2,Italian Leather Sofa,Fashion Nugget,Cake,351.39873,0.794331,0.650666,1996,-1,...,1,0.0409,0.307,0.00857,0.3,0.492,103.987,4.0,1996-09-17,Italian Leather Sofa - Cake
189643,005a475315cd3a29638cf242c4b7c71194e81642,2,Opera Singer,Comfort Eagle,Cake,248.29342,0.794331,0.650666,2001,-1,...,1,0.0412,0.0619,0.0193,0.0606,0.953,94.986,4.0,2001-07-23,Opera Singer - Cake
3148131,005a475315cd3a29638cf242c4b7c71194e81642,1,War Pigs,b-sides and rarities,Cake,234.00444,0.794331,0.650666,2007,68165,...,0,0.0357,0.00698,3e-06,0.0968,0.449,83.904,4.0,2007-11-13,War Pigs - Cake
1963991,005a475315cd3a29638cf242c4b7c71194e81642,2,Dime,Pressure Chief,Cake,217.49506,0.794331,0.650666,2004,-1,...,1,0.027,0.509,0.00886,0.203,0.888,114.002,4.0,2004-10-04,Dime - Cake
3682611,005a475315cd3a29638cf242c4b7c71194e81642,2,Tougher Than It Is,Pressure Chief,Cake,180.21832,0.794331,0.650666,2004,-1,...,0,0.0849,0.0724,3.3e-05,0.155,0.904,97.937,4.0,2004-10-04,Tougher Than It Is - Cake
1973504,005a475315cd3a29638cf242c4b7c71194e81642,2,Is This Love?,Motorcade Of Generosity,Cake,199.54893,0.794331,0.650666,1994,-1,...,1,0.0398,0.773,0.0,0.135,0.793,105.265,4.0,1994,Is This Love? - Cake
3153007,005a475315cd3a29638cf242c4b7c71194e81642,1,Strangers In The Night,Stubbs The Zombie: The Soundtrack,Cake,170.31791,0.794331,0.650666,2004,35297,...,0,0.0247,0.304,6e-06,0.485,0.628,90.022,4.0,2007-11-13,Strangers In The Night - Cake
668206,005a475315cd3a29638cf242c4b7c71194e81642,2,Comfort Eagle,Comfort Eagle,Cake,220.23791,0.794331,0.650666,2001,-1,...,1,0.0528,0.09,0.0,0.211,0.899,120.987,4.0,2001-07-23,Comfort Eagle - Cake
989670,005a475315cd3a29638cf242c4b7c71194e81642,2,No Phone,Pressure Chief,Cake,232.01914,0.794331,0.650666,2004,-1,...,0,0.0907,0.137,0.000221,0.0881,0.775,181.839,4.0,2004-10-04,No Phone - Cake


In [36]:
# Commentary on the result, followed by the re-ranked tracks that belong to the hidden history
print("Daria from 'Cake'is neither in the apparent nor the hidden playlist but is still recommended as the first rank")
print("The rest of the tracks, from artist importance to initial track ranking are well captured by the algorithm as suggestions for the hidden playlist while assigned ranks are performant")
in_hidden_history = full_ranks['track_id'].isin(df_user_hidden['track_id'])
full_ranks.loc[in_hidden_history]

Daria from 'Cake'is neither in the apparent nor the hidden playlist but is still recommended as the first rank
The rest of the tracks, from artist importance to initial track ranking are well captured and ranked by the algorithm as suggestions for the hidden playlist


Unnamed: 0,artist_name,track_id,0,1,min_centroids,artist_rank,best_rank
18251,Cake,Race Car Ya-Yas - Cake,7376.0,133.0,133.0,2.0,2.0
11910,Cake,Italian Leather Sofa - Cake,5773.0,710.0,710.0,3.0,3.0
5506,Cake,Dime - Cake,16199.0,1408.0,1408.0,4.0,4.0
4374,Cake,Comfort Eagle - Cake,13891.0,1426.0,1426.0,5.0,5.0
24672,Cake,Tougher Than It Is - Cake,17788.0,1689.0,1689.0,6.0,6.0
16888,Cake,Opera Singer - Cake,18564.0,1882.0,1882.0,7.0,7.0
25779,Cake,War Pigs - Cake,10354.0,2649.0,2649.0,8.0,8.0
16125,Cake,No Phone - Cake,3290.0,11196.0,3290.0,9.0,9.0
11703,Cake,Is This Love? - Cake,16089.0,4698.0,4698.0,10.0,10.0
27205,Cake,You Part the Waters - Cake,16496.0,7835.0,7835.0,11.0,11.0


In [210]:
# Disabled export of the table of hidden tracks to an image: the code sits inside a string literal, so it is not executed
'''
import dataframe_image as dfi
dfi.export(full_ranks[full_ranks['track_id'].isin(df_user_hidden['track_id'])], 'user_10.png')
'''