In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

In [None]:
def get_listenings_history_partition(listenings_history_feats, max_clusters = 10, random_state = None):
    """Cluster a user's listened tracks with KMeans, choosing k by silhouette score.

    Every number of clusters from 2 up to ``min(max_clusters, n_samples - 1)``
    (both bounds inclusive) is fitted, and the partition with the highest
    silhouette score is returned (the smallest such k on ties).

    Parameters
    ----------
    listenings_history_feats : pandas.DataFrame or 2-D array-like
        One row of features per listened track.
    max_clusters : int, default 10
        Largest number of clusters to try (inclusive).
    random_state : int, RandomState instance or None, default None
        Forwarded to every ``KMeans`` fit.

    Returns
    -------
    labels : numpy.ndarray of shape (n_samples,)
        Cluster label of each track in the selected partition.
    centroids : numpy.ndarray of shape (n_clusters, n_features)
        Cluster centers of the selected partition.
    score : float
        Silhouette score of the selected partition. NaN when no k >= 2 can be
        evaluated and the single-cluster fallback is returned.

    Raises
    ------
    ValueError
        If ``listenings_history_feats`` is empty.
    """
    n_samples = len(listenings_history_feats)
    if n_samples == 0:
        raise ValueError("listenings_history_feats must contain at least one track")

    # The silhouette score is only defined for 2 <= k <= n_samples - 1
    k_max = int(min(max_clusters, n_samples - 1))

    if k_max < 2:
        # Too few tracks (or max_clusters < 2) to compare partitions:
        # fall back to a single cluster centered on the mean of the tracks.
        centroid = np.asarray(listenings_history_feats, dtype = float).mean(axis = 0, keepdims = True)
        return np.zeros(n_samples, dtype = int), centroid, float('nan')

    # Search the optimal number of clusters
    partitions = []
    for k in range(2, k_max + 1):
        clf = KMeans(n_clusters = k, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats)
        score = silhouette_score(listenings_history_feats, clf.labels_, metric='euclidean')
        partitions.append((clf.labels_, clf.cluster_centers_, score))

    # Keep the partition with the best silhouette score (max() returns the first on ties)
    best_labels, best_centroids, best_score = max(partitions, key = lambda p: p[2])

    # Return the corresponding partition
    return best_labels, best_centroids, best_score

In [None]:
def kmeans_based_knn(listenings_history, X, n_neighbors, weighted = True, n_clusters = 'auto', random_state = None):
    """Recommend the tracks of ``X`` closest to the centroids of a user's listened tracks.

    The listened tracks are clustered, then each cluster receives a share of
    ``n_neighbors`` proportional to its size; for every cluster the nearest
    not-yet-listened tracks (Euclidean distance to its centroid) are picked,
    skipping tracks already picked for another cluster.

    Parameters
    ----------
    listenings_history : pandas.DataFrame
        Must expose ``track_id`` (labels present in ``X.index``) and
        ``listening_count`` columns.
    X : pandas.DataFrame
        Track features, indexed by track id.
    n_neighbors : int
        Total number of tracks to recommend.
    weighted : bool, default True
        If True, each listened track's features are multiplied by its share of
        the user's total listening count before clustering.
    n_clusters : 'auto' or int, default 'auto'
        'auto' selects the number of clusters with
        ``get_listenings_history_partition``; 1 uses the mean of the listened
        tracks; any other value is passed to ``KMeans``.
    random_state : int, RandomState instance or None, default None
        Forwarded to the KMeans fits (including the 'auto' search).

    Returns
    -------
    list
        Recommended track ids, largest cluster first, nearest first within a
        cluster. Holds fewer than ``n_neighbors`` ids when there are not
        enough candidate tracks.
    """
    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]

    # If a weighting of each track is desired:
    if weighted:
        # Weight of each track according to its number of listens
        w = listenings_history.listening_count / listenings_history.listening_count.sum()
        w.index = listenings_history_feats.index
        # NOTE(review): this rescales each track's feature vector (weights sum
        # to 1), so centroids are computed in the rescaled space while the
        # candidates below are compared unscaled - confirm this is intended
        # rather than a weighted mean / KMeans ``sample_weight``.
        listenings_history_feats = listenings_history_feats.apply(lambda col: col * w)

    # Compute the user tracks centroids
    if n_clusters == 'auto':
        labels, centroids, _ = get_listenings_history_partition(listenings_history_feats, random_state = random_state)
    elif n_clusters == 1:
        labels = np.zeros(len(listenings_history), dtype = int)
        # distance_matrix needs a 2-D (n_centroids, n_features) array
        centroids = np.array(listenings_history_feats.apply('mean'), ndmin = 2)
    else:
        clf = KMeans(n_clusters = n_clusters, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats)
        centroids = clf.cluster_centers_
        labels = clf.labels_

    # Define the number of neighbors to find according to the clusters size.
    # value_counts() is ordered by decreasing size, so its index (the cluster
    # label) is kept alongside each quota to address the right centroid later.
    cluster_size = pd.Series(labels).value_counts()
    quotas = [round(n_neighbors * share) for share in cluster_size / cluster_size.sum()]

    # Rounding can miss the target: hand a deficit to the last (smallest)
    # cluster, and remove a surplus starting from the last cluster without
    # ever letting a quota go negative.
    surplus = sum(quotas) - n_neighbors
    if surplus < 0:
        quotas[-1] -= surplus
    else:
        j = len(quotas) - 1
        while surplus > 0 and j >= 0:
            cut = min(quotas[j], surplus)
            quotas[j] -= cut
            surplus -= cut
            j -= 1

    # Compute the distance between the candidate tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the ranks of the tracks relating to their distance to each centroid
    R = D.rank(axis = 0)

    # Get the n_neighbors unique recommended tracks
    recommended_tracks = []
    already_picked = set()
    for label, quota in zip(cluster_size.index, quotas):
        if quota <= 0:
            continue
        picked = 0
        # Column ``label`` of R holds the ranks relative to that cluster's centroid
        for track in R.iloc[:, int(label)].sort_values().index:
            if track in already_picked:
                continue
            recommended_tracks.append(track)
            already_picked.add(track)
            picked += 1
            if picked == quota:
                break

    return recommended_tracks

In [None]:
def kmeans_based_ranking(listenings_history, X, weighted = True, n_clusters = 'auto', random_state = None):
    """Rank the not-yet-listened tracks of ``X`` by distance to each user centroid.

    The listened tracks are clustered, then every remaining track of ``X`` is
    ranked (1 = closest, ties get the average rank) by its Euclidean distance
    to each cluster centroid.

    Parameters
    ----------
    listenings_history : pandas.DataFrame
        Must expose ``track_id`` (labels present in ``X.index``) and
        ``listening_count`` columns.
    X : pandas.DataFrame
        Track features, indexed by track id.
    weighted : bool, default True
        If True, each listened track's features are multiplied by its share of
        the user's total listening count before clustering.
    n_clusters : 'auto' or int, default 'auto'
        'auto' selects the number of clusters with
        ``get_listenings_history_partition``; 1 uses the mean of the listened
        tracks; any other value is passed to ``KMeans``.
    random_state : int, RandomState instance or None, default None
        Forwarded to the KMeans fits (including the 'auto' search).

    Returns
    -------
    pandas.DataFrame
        One row per track of ``X`` absent from the listening history and one
        column per centroid, holding the rank of the track for that centroid.
    """
    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]

    # If a weighting of each track is desired:
    if weighted:
        # Weight of each track according to its number of listens
        w = listenings_history.listening_count / listenings_history.listening_count.sum()
        w.index = listenings_history_feats.index
        # NOTE(review): this rescales each track's feature vector (weights sum
        # to 1), so centroids are computed in the rescaled space while the
        # candidates below are compared unscaled - confirm this is intended
        # rather than a weighted mean / KMeans ``sample_weight``.
        listenings_history_feats = listenings_history_feats.apply(lambda col: col * w)

    # Compute the user tracks centroids (only the centroids are needed here)
    if n_clusters == 'auto':
        _, centroids, _ = get_listenings_history_partition(listenings_history_feats, random_state = random_state)
    elif n_clusters == 1:
        # distance_matrix needs a 2-D (n_centroids, n_features) array
        centroids = np.array(listenings_history_feats.apply('mean'), ndmin = 2)
    else:
        clf = KMeans(n_clusters = n_clusters, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats)
        centroids = clf.cluster_centers_

    # Compute the distance between the candidate tracks and the centroids
    candidates = X.index.difference(listenings_history_feats.index)
    D = pd.DataFrame(distance_matrix(X.loc[candidates], centroids), index = candidates)

    # Get the ranks of the tracks relating to their distance to each centroid
    return D.rank(axis = 0)