In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

In [3]:
from ipynb.fs.full.datamanagement import user_filter, summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history

In [4]:
def get_listenings_history_partition(listenings_history_feats, listenings_history, max_clusters = 15, random_state = None):
    """
    Partition a user's listened tracks with KMeans, picking the number of
    clusters that maximises the silhouette score.

    Parameters:
        listenings_history_feats: feature rows of the listened tracks (one row per
            track); passed to KMeans.fit and to silhouette_score.

        listenings_history: object exposing a `listening_count` attribute, used as
            KMeans sample weights (one weight per row of listenings_history_feats).

        max_clusters (int, default = 15): EXCLUSIVE upper bound of the candidate
            numbers of clusters. Candidates are 2 .. min(max_clusters, n_tracks - 1) - 1.

        random_state (int, default = None): forwarded to KMeans.

    Return:
        (labels, centroids, silhouette, n_clusters) of the best partition. On ties
        the smallest number of clusters wins.

    Raises:
        ValueError: if there is no candidate number of clusters to evaluate
            (fewer than 4 tracks, or max_clusters < 3).
    """
    n_tracks = len(listenings_history_feats)
    n_clusters_range = np.arange(2, min(max_clusters, n_tracks - 1))
    if len(n_clusters_range) == 0:
        raise ValueError(
            "Need at least 4 listened tracks and max_clusters >= 3 to search for a partition "
            f"(got {n_tracks} tracks, max_clusters={max_clusters})."
        )

    # Search the optimal number of clusters, keeping only the best partition so far
    best = None
    for k in n_clusters_range:
        clf = KMeans(n_clusters = int(k), n_init = "auto", random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listenings_history.listening_count)
        score = silhouette_score(listenings_history_feats, clf.labels_, metric = 'euclidean')
        # Strict comparison: the first (smallest) k reaching the maximum is retained
        if best is None or score > best[2]:
            best = (clf.labels_, clf.cluster_centers_, score, int(k))

    return best

In [5]:

def _allocate_neighbors(cluster_sizes, n_neighbors):
    """
    Split n_neighbors between clusters proportionally to their sizes.

    Uses the largest-remainder method: every quota is non-negative and the quotas
    sum to n_neighbors (plain rounding could make the last quota negative).
    """
    cluster_sizes = np.asarray(cluster_sizes, dtype = float)
    shares = n_neighbors * cluster_sizes / cluster_sizes.sum()
    quotas = np.floor(shares).astype(int)
    leftover = int(n_neighbors - quotas.sum())
    if leftover > 0:
        # Hand the remaining slots to the clusters with the largest fractional part
        by_remainder = np.argsort(-(shares - quotas), kind = 'stable')
        quotas[by_remainder[:leftover]] += 1
    return quotas.tolist()


def kmeans_based_knn(listenings_history, X, n_neighbors, weighted = False, n_clusters ='Default',  random_state = None):
    """
    Recommend the n_neighbors unlistened tracks closest to the centroid(s) of the
    tracks listened by a user.

    Parameters:
        listenings_history (pandas.DataFrame): must expose `track_id` and
            `listening_count`; every track_id must be a label of X's index.

        X (pandas.DataFrame): track features, indexed by track id.

        n_neighbors (int): number of tracks to recommend.

        weighted (boolean, default = False): if True, each listened track's features
            are multiplied by its share of the user's total listening count before
            the centroids are computed.

        n_clusters (int, 'auto' or 'Default'):
            - 'auto': silhouette-optimal partition of the (possibly weighted) features.
            - 1: a single centroid (the mean, or the weighted mean if weighted).
            - int > 1: KMeans with exactly that many clusters.
            - anything else (including 'Default'): KMeans with the silhouette-optimal
              number of clusters found on the unweighted features.

        random_state (int, default = None): forwarded to KMeans.

    Return:
        list of recommended track ids; each cluster contributes a number of tracks
        proportional to its size.
    """
    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]
    listening_counts = listenings_history.listening_count

    explicit_k = None
    if isinstance(n_clusters, (int, np.integer)) and not isinstance(n_clusters, bool):
        explicit_k = int(n_clusters)

    # The silhouette search is only needed for the default mode; it runs on the
    # unweighted features.
    if explicit_k is None and n_clusters != 'auto' and n_clusters != 1:
        _, _, _, explicit_k = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)

    # If a weighting of each track is desired:
    if weighted:
        # Weight of each track according to its share of the listens (weights sum to 1)
        w = (listening_counts / listening_counts.sum()).to_numpy()
        # NOTE(review): in the KMeans branches the weighted features live on a different
        # scale than the candidate rows of X, and sample_weight is applied as well.
        listenings_history_feats = listenings_history_feats.mul(w, axis = 0)

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        labels, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)
    elif n_clusters == 1:
        labels = np.zeros(len(listenings_history), dtype = int)
        # The weights sum to 1, so summing the weighted rows yields the weighted mean
        center = listenings_history_feats.sum() if weighted else listenings_history_feats.mean()
        # distance_matrix needs a 2-D array: one row per centroid
        centroids = center.to_numpy().reshape(1, -1)
    else:
        clf = KMeans(n_clusters = explicit_k, n_init = "auto", random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listening_counts)
        centroids = clf.cluster_centers_
        labels = clf.labels_

    # Cluster sizes indexed by cluster label, so that quota i matches centroid i
    cluster_size = np.bincount(np.asarray(labels, dtype = int), minlength = len(centroids))
    nb_tracks_by_clusters = _allocate_neighbors(cluster_size, n_neighbors)

    # Compute the distance between the unlistened tracks and the centroids
    candidate_ids = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidate_ids], centroids), index = candidate_ids)

    # Get the n_neighbors unique recommended tracks, closest first for each centroid
    recommended_tracks = []
    already_recommended = set()
    for i, quota in enumerate(nb_tracks_by_clusters):
        picked = 0
        for track in D.iloc[:, i].sort_values().index:
            if picked >= quota:
                break
            if track in already_recommended:
                continue
            recommended_tracks.append(track)
            already_recommended.add(track)
            picked += 1

    return recommended_tracks


In [6]:

def kmeans_based_ranking(listenings_history, X,  weighted = False, n_clusters="Default",  random_state = None):
    """
    Define the centroid(s) representing the average audio features of the tracks listened by a user.
    Estimate the affinity of the user for the tracks they have not listened to, based on the
    distance of those tracks to this (these) centroid(s).
    Return the rank of the unlistened tracks based on this affinity.

    Parameters:
        listenings_history (pandas.DataFrame): a dataframe whose columns include
            - track_id : a unique id for a track (must be a label of X's index)

            - listening_count: the number of times the user has listened to the track

            ... : other columns are ignored by this function.

        X (pandas.DataFrame): the audio features of the tracks, indexed by track id.

        weighted (boolean, default = False): if True, each listened track's features are
            multiplied by its share of the user's total listening count before the
            centroids are computed.

        n_clusters (int, 'auto' or 'Default'): the number of centroids to form.
            - int: exactly that many centroids (1 gives the mean, or the weighted mean
              if weighted).
            - 'auto': silhouette-optimal partition of the (possibly weighted) features.
            - 'Default': KMeans with the silhouette-optimal number of clusters found
              on the unweighted features.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.

    Return :
       R (pandas.DataFrame): one row per unlistened track and one column per centroid;
       each column holds the rank of the tracks by increasing distance to that centroid.

    Raises:
        ValueError: if n_clusters is neither an int, 'auto' nor 'Default'.
    """
    is_int_k = isinstance(n_clusters, (int, np.integer)) and not isinstance(n_clusters, bool)
    is_keyword = isinstance(n_clusters, str) and n_clusters in ('auto', 'Default')
    if not (is_int_k or is_keyword):
        raise ValueError(f"n_clusters must be an int, 'auto' or 'Default', got {n_clusters!r}")

    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]
    listening_counts = listenings_history.listening_count

    # The silhouette search is only needed for the default mode; it runs on the
    # unweighted features.
    k = int(n_clusters) if is_int_k else None
    if n_clusters == 'Default':
        _, _, _, k = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)

    # If a weighting of each track is desired:
    if weighted:
        # Weight of each track according to its share of the listens (weights sum to 1)
        w = (listening_counts / listening_counts.sum()).to_numpy()
        # NOTE(review): in the KMeans branches the weighted features live on a different
        # scale than the candidate rows of X, and sample_weight is applied as well.
        listenings_history_feats = listenings_history_feats.mul(w, axis = 0)

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        _, centroids, _, _ = get_listenings_history_partition(listenings_history_feats, listenings_history, random_state = random_state)
    elif k == 1:
        # The weights sum to 1, so summing the weighted rows yields the weighted mean
        center = listenings_history_feats.sum() if weighted else listenings_history_feats.mean()
        # distance_matrix needs a 2-D array: one row per centroid
        centroids = center.to_numpy().reshape(1, -1)
    else:
        clf = KMeans(n_clusters = k, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats, sample_weight = listening_counts)
        centroids = clf.cluster_centers_

    # Compute the distance between the unlistened tracks and the centroids
    candidate_ids = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidate_ids], centroids), index = candidate_ids)

    # Rank the tracks by their distance to each centroid (1 = closest)
    R = D.rank(axis = 0)

    return R


In [7]:
# Read the data
# os.path.abspath('') resolves to the current working directory; the CSV is looked up
# in the "data" folder that sits next to that directory.
CUR_DIR = os.path.abspath('')

DATA_DIR = Path(CUR_DIR).parent / "data"
FILE_PATH = DATA_DIR/'triplets_metadata_spotify.csv'
# The first CSV column is used as the DataFrame index.
df = pd.read_csv(FILE_PATH, index_col = 0)

In [8]:
# Build the track identifier as "<song name> - <artist name>".
df['track_id'] = df.song_name + ' - ' + df.artist_name

In [9]:
## Keeping only the first occurrence of each song/artist pair would reduce redundant
## observations, but that de-duplication applies to tracks_feats, not to the full df:
#df = df.drop_duplicates(subset = ['song_name', 'artist_name'], keep='first').reset_index()

# Keep a single row per (user, track_id) pair, then display the remaining number of rows.
df = df.drop_duplicates(subset = ['user','track_id'], keep='first')
len(df)

4973744

### User filtering

In [10]:
# Per-user summary of the listening history (helper imported from the datamanagement notebook).
# Later cells read its 'user', 'listening_count' and 'track_count' columns.
users_summary = summarise_listening_history(df)

In [11]:
# Select the user ids to keep (helper imported from the datamanagement notebook).
# NOTE(review): presumably nlist_* bound the total listens and ntracks_min the number of
# distinct tracks per user — confirm against user_filter's definition.
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 3000, ntracks_min = 10)

In [12]:
# Restrict both the listening triplets and the per-user summary to the selected users.
wdf = df.loc[df['user'].isin(ids),:]
wdf_users_summary = users_summary.loc[users_summary['user'].isin(ids),:]

In [13]:
# Get the songs quantitative features

# Drop the per-listen columns so that only track-level columns remain.
# 'track_id' already exists on df (song name + ' - ' + artist name), so it is not rebuilt here.
tracks_feats = df.drop(['user', 'listening_count'],axis = 1)

# Keep the audio features plus 'track_id' ('tempo' is deliberately kept).
# De-duplicate on 'track_id' alone: the feature matrix built from this frame is indexed
# by track_id and looked up with .loc, which needs exactly one row per track.
tracks_quanti_feats = tracks_feats.drop(['song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
       'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature', 'release_date', 'duration'], axis = 1).drop_duplicates(subset = 'track_id', keep = 'first')

In [14]:
# Standardize the songs quantitative features (zero mean, unit variance per column)
X_raw = tracks_quanti_feats.set_index('track_id')

scaler = StandardScaler()
# Build the scaled frame directly with the original row and column labels, instead of
# assigning a frame with columns 0..n-1 back into X and relying on index re-alignment.
X = pd.DataFrame(scaler.fit_transform(X_raw), index = X_raw.index, columns = X_raw.columns)

In [15]:
# Display the standardized feature matrix (one row per track_id, one column per audio feature).
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Stronger - Kanye West,0.667073,0.244482,0.234685,0.971940,-0.825165,-0.623569,1.661334,0.055057,-0.585837
Stacked Actors - Foo Fighters,-0.048441,1.150060,0.862181,-0.294877,-0.842482,-0.620462,0.105924,0.728483,0.441564
Clarity - John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.526080,-0.197701,0.039664,-0.893749
Gimme Stitches - Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438,-0.324244
Breakout - Foo Fighters,-0.501227,1.165741,1.072171,-0.272758,-0.848599,-0.614704,-0.480018,-0.206617,1.169095
...,...,...,...,...,...,...,...,...,...
Dime - Jerry Rivera,0.991291,-0.429800,-0.219056,-0.392401,1.575591,-0.623569,0.015369,1.247983,-1.122880
The Long Conversation - Pierre de Reeder,-0.395018,-1.370660,-0.236161,-0.523105,1.705467,-0.520901,0.825034,-1.007032,-0.805067
That's the Way That It Was - Pierre de Reeder,0.583224,0.228801,0.703957,-0.425580,-0.754445,-0.623329,-0.543939,-0.679940,0.473561
I'll Close My Eyes - Mike Jones,-0.283219,-2.126092,-3.399071,-0.445688,2.187863,2.115258,3.328605,-0.899284,-1.731493


### Split the data into hidden and apparent sets

In [16]:
# Triplets ordered by decreasing listening count.
# NOTE(review): wdf_sorted is not referenced by any later cell visible here.
wdf_sorted = wdf.sort_values('listening_count', ascending=False)
# Per-user summary ordered by user id, with the average number of listens per track.
wdf_users_summary_sorted = wdf_users_summary.sort_values('user')
wdf_users_summary_sorted['listening_track_ratio'] = wdf_users_summary_sorted['listening_count'] / wdf_users_summary_sorted['track_count']

In [17]:
# Re-order the users by decreasing listens-per-track ratio (ties broken by decreasing
# track count) and display the first 50.
wdf_users_summary_sorted = wdf_users_summary_sorted.sort_values(by=['listening_track_ratio', 'track_count'], ascending=False)
wdf_users_summary_sorted.head(50)

Unnamed: 0,user,listening_count,track_count,listening_track_ratio
375966,716ed1ec67d67bfa05db3ffeb641d13f46dca6ec,1143,10,114.3
352324,6a46aee45cc177cf8e2025e59d21c7939902deee,1379,16,86.1875
580888,af3ee32357049dd96231238bd1b019e8142ee6aa,1553,19,81.736842
224432,43e241eaa4d079ab2bdbd30231695f5563583af4,727,11,66.090909
210372,3fa44653315697f42410a30cb766a4eb102080bb,1506,25,60.24
114964,22e08d5e101ab5b86dc394856d508e175a5242a6,1603,29,55.275862
690540,d035c4a2b179ef8c756d73f9f3018ec7a87d8594,548,10,54.8
514797,9b3b5c53b64cbe8f36a5ec11aad76c09cbf3e57d,689,13,53.0
781689,eba8c4d803d654fef4cda5d8f4f70f5acea918a1,757,15,50.466667
96195,1d36c586adf56c1be114fb173bf23ececa79e9c7,750,15,50.0


In [18]:
# Attempt 2: triplets not pre-sorted by user, with the precomputed users_summary passed in.
# Times the split with wall-clock timestamps taken before and after the call.
st = time.time()
# Later cells use test1 as the apparent (visible) history and test2 as the hidden one.
# NOTE(review): 0.2 is presumably the hidden fraction — confirm against split_history.
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 9.307382345199585 seconds


In [285]:
# Candidate recommendation-list sizes.
n_neighbors = [10, 25, 50, 100, 150, 200]
# Users with more than 100 distinct tracks, in the order of wdf_users_summary_sorted.
sample = list(wdf_users_summary_sorted.loc[wdf_users_summary_sorted.track_count > 100].user)

In [382]:
# Alternative users that can be inspected instead:
#user_name = '22e08d5e101ab5b86dc394856d508e175a5242a6'
#user_name = '005a475315cd3a29638cf242c4b7c71194e81642'

# User under study: the 7th entry of the sampled users.
user_name = sample[6]

# Apparent and hidden listening histories restricted to the sampled users.
df_apparent_sample = test1.loc[test1.user.isin(sample),: ]
df_hidden_sample = test2.loc[test2.user.isin(sample),: ]


In [383]:
# Apparent and hidden listening histories of the selected user only.
df_user_apparent = df_apparent_sample.loc[df_apparent_sample.user == user_name]
df_user_hidden = df_hidden_sample.loc[df_hidden_sample.user == user_name]



In [384]:

# Rank every track the selected user has not listened to, from their apparent history.
# df_user_apparent already holds this user's slice of df_apparent_sample.
user_rank = kmeans_based_ranking(
        listenings_history = df_user_apparent,
        X = X,
        random_state = 123)

# Move track_id from the index to a regular column (needed by rank_by_artist_filter).
user_rank = user_rank.reset_index()


In [385]:
def rank_by_artist_filter(initial_ranking, apparent_history: pd.DataFrame):
    """
    Re-rank candidate tracks so that tracks by artists the user already listens to come first.

    Parameters:
        initial_ranking (pandas.DataFrame): a 'track_id' column formatted as
            "<song name> - <artist name>", plus one column of ranks per centroid.
            The frame is NOT modified.

        apparent_history (pandas.DataFrame): the user's visible listening history, with
            'artist_name' and 'track_id' columns.

    Return:
        pandas.DataFrame with the original index labels and the columns
            artist_name, track_id, <centroid columns>, min_centroids, artist_rank, best_rank
        ordered by artist_rank, where
            - min_centroids is the best (lowest) rank of the track over all centroids;
            - artist_rank runs from 1 to the number of tracks: first the tracks of the user's
              artists (most listened artist first, then increasing min_centroids), then all
              remaining tracks by increasing min_centroids;
            - best_rank is the minimum of min_centroids and artist_rank.
    """
    # Columns this function adds itself; anything else besides track_id is a centroid column.
    # Selecting by name lets an already re-ranked frame be passed in again safely.
    derived_cols = {'artist_name', 'track_id', 'min_centroids', 'artist_rank', 'best_rank'}
    centroid_cols = [col for col in initial_ranking.columns if col not in derived_cols]

    # Work on a copy so the caller's frame is left untouched (re-running the cell is safe)
    ranking = initial_ranking[['track_id'] + centroid_cols].copy()

    # The artist is the part after the LAST ' - ', so song titles containing ' - ' are handled.
    # NOTE(review): artist names that themselves contain ' - ' are still mis-parsed.
    ranking.insert(0, 'artist_name', ranking['track_id'].str.rsplit(' - ', n = 1).str[-1])
    ranking['min_centroids'] = ranking[centroid_cols].min(axis = 1)

    # Get the user's artists, sorted by decreasing share of the apparent history
    user_artists = (apparent_history.groupby('artist_name')['track_id'].count()/len(apparent_history)).sort_values(ascending=False)

    # Priority of each track: position of its artist in user_artists; tracks by any other
    # artist share the last priority.
    artist_priority = {artist: position for position, artist in enumerate(user_artists.index)}
    ranking['_artist_priority'] = ranking['artist_name'].map(artist_priority).fillna(len(artist_priority))

    # A single sort replaces the per-artist loop and the repeated concatenations
    ranking = ranking.sort_values(['_artist_priority', 'min_centroids'], kind = 'stable')
    ranking = ranking.drop(columns = '_artist_priority')
    ranking['artist_rank'] = np.arange(1, len(ranking) + 1, dtype = float)

    ranking['best_rank'] = ranking[['min_centroids', 'artist_rank']].min(axis = 1)

    return ranking

In [386]:
# Re-rank the candidate tracks, putting tracks by the user's known artists first.
full_ranks = rank_by_artist_filter(initial_ranking = user_rank, apparent_history = df_user_apparent)

In [387]:
# Display the re-ranked candidate tracks.
full_ranks

Unnamed: 0,artist_name,track_id,0,1,min_centroids,artist_rank,best_rank
8033,Woods,From the Horn - Woods,873.0,11398.0,873.0,1.0,1.0
5882,Woods,Don't Pass On Me - Woods,16740.0,1081.0,1081.0,2.0,2.0
17506,Woods,Pick Up - Woods,18927.0,1956.0,1956.0,3.0,3.0
12413,Woods,Keep It On - Woods,18462.0,3898.0,3898.0,4.0,4.0
25703,Woods,Walk The Dogs - Woods,22995.0,4231.0,4231.0,5.0,5.0
...,...,...,...,...,...,...,...
18828,George Lopez,Right Now Right Now?... Later Later! - George ...,27510.0,27516.0,27510.0,27514.0,27510.0
14918,Jackie Martling,Mismatches & Memories - Jackie Martling,27511.0,27511.0,27511.0,27515.0,27511.0
20729,George Lopez,Socoro - George Lopez,27512.0,27512.0,27512.0,27516.0,27512.0
11625,Swell,Intro - Swell,27517.0,27515.0,27515.0,27517.0,27515.0


In [388]:
# Show where the user's hidden (held-out) tracks ended up in the ranking.
full_ranks[full_ranks['track_id'].isin(df_user_hidden['track_id'])]

Unnamed: 0,artist_name,track_id,0,1,min_centroids,artist_rank,best_rank
8033,Woods,From the Horn - Woods,873.0,11398.0,873.0,1.0,1.0
17506,Woods,Pick Up - Woods,18927.0,1956.0,1956.0,3.0,3.0
12413,Woods,Keep It On - Woods,18462.0,3898.0,3898.0,4.0,4.0
2633,Woods,Blood Dries Darker - Woods,5536.0,5111.0,5111.0,7.0,7.0
2798,Woods,Bone Tapper - Woods,26253.0,16973.0,16973.0,21.0,21.0
12463,The Antlers,Kettering - The Antlers,25479.0,1320.0,1320.0,24.0,24.0
20820,Beach House,Some Things Last a Long Time - Beach House,9448.0,285.0,285.0,25.0,25.0
14020,Beach House,Lovelier Girl - Beach House,25698.0,768.0,768.0,26.0,26.0
24518,Beach House,Tokyo Witch - Beach House,16165.0,4781.0,4781.0,29.0,29.0
896,Beach House,All The Years - Beach House,23186.0,6269.0,6269.0,30.0,30.0


In [210]:
# Disabled export of the hidden-track ranking table to an image: the code is kept inside
# an inert string literal. Re-enabling it requires the `dataframe_image` package.
'''
import dataframe_image as dfi
dfi.export(full_ranks[full_ranks['track_id'].isin(df_user_hidden['track_id'])], 'user_10.png')
'''