In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipynb
from scipy.spatial import distance_matrix
import random
import sys as sys
import time
import re

In [2]:
from ipynb.fs.full.datamanagement import user_filter, summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history, scoring_accuracy_vs_serendipity, scoring_in_out
from ipynb.fs.full.feature_based_algos import get_listenings_history_partition, kmeans_based_knn


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

## Data loading

In [3]:
# Read the data
CUR_DIR = os.path.abspath('')

DATA_DIR = Path(CUR_DIR).parent / "data"
FILE_PATH = DATA_DIR/'triplets_metadata_spotify.csv'
df = pd.read_csv(FILE_PATH, index_col = 0)

In [4]:
df['track_id'] = df.song_name + '-' + df.artist_name

In [5]:
len(df)

4982520

## Data management

In [6]:
## keep first occurence of a given song/artist instance to reduce redundant observations

df = df.drop_duplicates(subset = ['user','track_id'], keep='first')
len(df)

4973744

### Users filtering

In [7]:
users_summary = summarise_listening_history(df)

In [8]:
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 500, ntracks_min = 5)

In [9]:
wdf = df.loc[df['user'].isin(ids),:]
wdf_users_summary = users_summary.loc[users_summary['user'].isin(ids),:]

In [10]:
print('Nb of ligns in df  :', len(df))
print('Nb of ligns in wdf :', len(wdf))

Nb of ligns in df  : 4973744
Nb of ligns in wdf : 3856377


In [11]:
len(wdf_users_summary)

327081

In [12]:
wdf_users_summary.head()

Unnamed: 0,user,listening_count,track_count
7,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10
11,0000bb531aaa657c932988bc2f7fd7fc1b2050ec,14,10
12,0000d3c803e068cf1da17724f1674897b2dd7130,7,5
14,0000f88f8d76a238c251450913b0d070e4a77d19,30,8
15,000138e252eea35fd73aaf66a9b34102b695a9c8,26,13


### Songs management

In [13]:
# Get the songs quantitative features

tracks_feats = df.drop(['user', 'listening_count'],axis = 1)
tracks_feats['track_id'] = df.song_name + '-' + df.artist_name
tracks_quanti_feats = tracks_feats.drop(['song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
       'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature', 'release_date', 'duration', 'tempo'], axis = 1).drop_duplicates()

In [14]:
# Normalize the songs quantitative feats
X = tracks_quanti_feats.drop(['track_id'], axis = 1)
X.index = tracks_quanti_feats.track_id

scaler = StandardScaler()
X[X.columns] = pd.DataFrame(scaler.fit_transform(X), index=X.index)


In [15]:
X

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Stronger-Kanye West,0.667073,0.244482,0.234685,0.971940,-0.825165,-0.623569,1.661334,0.055057
Stacked Actors-Foo Fighters,-0.048441,1.150060,0.862181,-0.294877,-0.842482,-0.620462,0.105924,0.728483
Clarity-John Mayer,0.868312,0.283685,1.220943,-0.411504,-0.316887,-0.526080,-0.197701,0.039664
Gimme Stitches-Foo Fighters,0.180747,1.244146,1.071496,-0.108876,-0.843348,-0.614856,0.260399,0.197438
Breakout-Foo Fighters,-0.501227,1.165741,1.072171,-0.272758,-0.848599,-0.614704,-0.480018,-0.206617
...,...,...,...,...,...,...,...,...
Dime-Jerry Rivera,0.991291,-0.429800,-0.219056,-0.392401,1.575591,-0.623569,0.015369,1.247983
The Long Conversation-Pierre de Reeder,-0.395018,-1.370660,-0.236161,-0.523105,1.705467,-0.520901,0.825034,-1.007032
That's the Way That It Was-Pierre de Reeder,0.583224,0.228801,0.703957,-0.425580,-0.754445,-0.623329,-0.543939,-0.679940
I'll Close My Eyes-Mike Jones,-0.283219,-2.126092,-3.399071,-0.445688,2.187863,2.115258,3.328605,-0.899284


## Recommendation algorithms evaluation

### Split the data into hidden and apparent sets

In [16]:
wdf_sorted = wdf.sort_values('user')
wdf_users_summary_sorted = wdf_users_summary.sort_values('user')

In [21]:
# Try 1; Triplets non sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 1:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 1: 42.95786738395691 seconds


In [22]:
# Try 2: Triplets non sorted by users, with users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 36.215386390686035 seconds


In [23]:
# Try 3: Triplets sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 3:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 21.979166746139526 seconds


In [17]:
# Try 4: Triplets sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, users_summary = wdf_users_summary_sorted, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('\n Execution time 4:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 4: 8.457518339157104 seconds


### Definition of the grid of parameters

In [30]:
# n_neighbors = [10, 25, 50, 100, 150, 200]

### Initialize the table of results  

In [130]:
# results = pd.DataFrame({
#     'user' : wdf_users_summary_sorted.user,
#     'listening_count' : wdf_users_summary_sorted.listening_count,
#     'track_count' :  wdf_users_summary_sorted.track_count
# })

# for r in ['n_reco-'+ str(n)+ '-' + i  for n in n_neighbors for i in ("reco_in","reco_out")]:
#     results[r] = None
# results = results.melt(id_vars = ['user', 'listening_count', 'track_count']).sort_values('user')
# results['n_reco'] =  results.variable.str.extract(r'([0-9]+)').astype('int')
# results['reco_type'] =  results.variable.str.extract(r'(reco_.+)')
# results = results.drop(['variable'], axis = 1)
# results.to_csv('../data/reco_res.csv')

In [31]:
results = pd.read_csv('../data/reco_res.csv')

In [32]:
results.head()

Unnamed: 0.1,Unnamed: 0,user,listening_count,track_count,n_hidden,n_reco,reco_in,reco_out
0,539,006efbbc4ab2dbc89a4b352282919c1d27ee7c74,19,12,2.0,10.0,0.0,10.0
1,981,00c2ffb64177c627f5fb3c9f101208123aaf298c,46,5,1.0,10.0,0.0,10.0
2,1107,00e090ae8c47db1be5f7061b01eddced92c3c435,19,11,2.0,10.0,0.0,10.0
3,1542,0139472c899f9b6eb9c4a1689f6fd78f1af1be13,12,8,2.0,10.0,0.0,10.0
4,1732,015c7500549e6c6bbd94a06fd2ca8baa7284784f,9,9,2.0,10.0,0.0,10.0


### Random recommandations

In [17]:
def random_k_reco(triplets, list_tracks, k, random_state):
    users = triplets.user.unique()
    reco = np.concatenate([random.sample(list(set(list_tracks) - set(triplets.loc[triplets.user == i, :].track_id)), k) for i in users])
    return pd.DataFrame({'user' : [i for i in users for j in range(k)],'track_id' : reco})

In [19]:
random.seed = 1234
sample = list(wdf_users_summary_sorted.loc[wdf_users_summary_sorted.track_count > 100].user)

In [27]:
rranks = pd.DataFrame({u : [i for i in random.sample(range(len(X)),len(X))] for u in sample})
rranks['tracks'] = X.index

In [28]:
rranks = rranks.melt(var_name = 'user', value_name = 'rank', id_vars = 'tracks')

Unnamed: 0,tracks,user,rank
0,Stronger-Kanye West,005a475315cd3a29638cf242c4b7c71194e81642,3137
1,Stacked Actors-Foo Fighters,005a475315cd3a29638cf242c4b7c71194e81642,19299
2,Clarity-John Mayer,005a475315cd3a29638cf242c4b7c71194e81642,4464
3,Gimme Stitches-Foo Fighters,005a475315cd3a29638cf242c4b7c71194e81642,20060
4,Breakout-Foo Fighters,005a475315cd3a29638cf242c4b7c71194e81642,14514
...,...,...,...
4803613,Dime-Jerry Rivera,fef771ab021c200187a419f5e55311390f850a50,41
4803614,The Long Conversation-Pierre de Reeder,fef771ab021c200187a419f5e55311390f850a50,23480
4803615,That's the Way That It Was-Pierre de Reeder,fef771ab021c200187a419f5e55311390f850a50,9786
4803616,I'll Close My Eyes-Mike Jones,fef771ab021c200187a419f5e55311390f850a50,18577


In [31]:
df_hidden_sample

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
1967312,005a475315cd3a29638cf242c4b7c71194e81642,2,Red Right Ankle,Her Majesty The Decemberists,The Decemberists,209.08363,0.638557,0.651676,2003,-1,...,1,0.0295,0.8720,0.000013,0.0895,0.452,92.576,4.0,2003,Red Right Ankle-The Decemberists
3155438,005a475315cd3a29638cf242c4b7c71194e81642,2,Haze of Love,Motorcade of Generosity,Cake,187.89832,0.794331,0.650666,1994,-1,...,1,0.0353,0.6100,0.000050,0.0807,0.775,105.327,4.0,1994,Haze of Love-Cake
1916274,005a475315cd3a29638cf242c4b7c71194e81642,1,Interlude (Milo),Good News For People Who Love Bad News,Modest Mouse,58.90567,0.808802,0.613120,2004,-1,...,1,0.0377,0.1700,0.877000,0.1010,0.340,89.658,4.0,2004-04-05,Interlude (Milo)-Modest Mouse
1901878,005a475315cd3a29638cf242c4b7c71194e81642,1,Never Ending Math Equation,Baron Von Bullshit Rides Again,Modest Mouse,218.85342,0.808802,0.613120,1998,61840,...,1,0.0644,0.0385,0.000013,0.0678,0.896,182.841,4.0,2000-01-18,Never Ending Math Equation-Modest Mouse
303118,005a475315cd3a29638cf242c4b7c71194e81642,2,The Bachelor and the Bride,Her Majesty The Decemberists,The Decemberists,252.81261,0.638557,0.651676,2003,-1,...,0,0.0364,0.0556,0.000008,0.1340,0.505,164.505,4.0,2003,The Bachelor and the Bride-The Decemberists
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4247138,fef771ab021c200187a419f5e55311390f850a50,1,Hidden Track,Somewhere To Elsewhere,Kansas,84.03546,0.762318,0.470917,2000,-1,...,0,0.1600,0.9930,0.879000,0.6780,0.901,114.778,4.0,2000,Hidden Track-Kansas
980607,fef771ab021c200187a419f5e55311390f850a50,1,Engwish Bwudd,Six Demon Bag,Man Man,213.83791,0.710877,0.402965,2005,-1,...,1,0.0307,0.0896,0.000000,0.7800,0.928,174.988,4.0,2006,Engwish Bwudd-Man Man
698934,fef771ab021c200187a419f5e55311390f850a50,1,'Round Midnight,Discover Miles Davis,Miles Davis,358.32118,0.684322,0.586716,0,-1,...,0,0.0467,0.7900,0.000368,0.1290,0.203,112.890,4.0,1957-03-18,'Round Midnight-Miles Davis
4452223,fef771ab021c200187a419f5e55311390f850a50,1,Blue Orchids,April,Sun Kil Moon,356.20526,0.755041,0.415472,2008,-1,...,0,0.0322,0.9780,0.182000,0.0980,0.321,80.612,4.0,2008-04-01,Blue Orchids-Sun Kil Moon


In [29]:
df_apparent_sample = test1.loc[test1.user.isin(sample),: ]
df_hidden_sample = test2.loc[test2.user.isin(sample),: ]

In [48]:
reco = random_k_reco(df_apparent_sample, X.index, n_neighbors[3])

In [49]:
scores = scoring_in_out(df_hidden_sample, reco)

In [50]:
scores.reco_in.value_counts()

0    166
1      8
Name: reco_in, dtype: int64

In [51]:
scores

Unnamed: 0,user,n_hidden,n_reco,reco_in,reco_out
0,005a475315cd3a29638cf242c4b7c71194e81642,22,100,0,100
1,00b9144fc8cd453b81c362b30dbf8b8266eec220,32,100,0,100
2,05a82aa743db5b6c5965e72d458d7c0755fd9f0f,22,100,0,100
3,062eef2a03b53d2b10f5018135e3361659c6a3bf,27,100,0,100
4,070941445cba8e8a9157f1253116cc430c31a811,22,100,0,100
...,...,...,...,...,...
169,fab8d9648b537cd5c2af445a6a5218a8010751b7,21,100,0,100
170,fc6538bd1e2db0d960cf5c27e9cb0d5de2cddd22,26,100,0,100
171,fd2e8f7a73aceb8ef77e1fcfd58909e815ade83b,23,100,0,100
172,fe9a05c03c29da973743a83b80d1660748077432,25,100,0,100


In [171]:
scores = scoring_in_out(df_hidden_sample, reco)
scores['algo'] = 'random'
scores['seed'] = 1234

In [173]:
results = wdf_users_summary_sorted.merge(scores, how = 'inner', on = ['user'])

In [261]:
results.reco_in.value_counts()

0    998
1      2
Name: reco_in, dtype: int64

In [166]:
 results.to_csv('../data/reco_res.csv')

## Work in progress

In [41]:
rank_kmbknn, _, _ = pd.DataFrame(
    {i : kmeans_based_ranking(
        listenings_history = df_apparent_sample.loc[df_apparent_sample.user == i],
        X = X,
        # n_neighbors = n_neighbors[3],
        random_state = 1234) for i in sample[0:1]}
    )

ValueError: not enough values to unpack (expected 3, got 1)

In [44]:
test, _, _ = kmeans_based_ranking(
        listenings_history = df_apparent_sample.loc[df_apparent_sample.user == sample[0]],
        X = X,
        # n_neighbors = n_neighbors[3],
        random_state = 1234)

In [45]:
test

Unnamed: 0_level_0,0,1,2,3
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
$ VS. Entertainment-Askeleton,17128.0,17239.0,17968.0,17706.0
$1000 Wedding-Gram Parsons,8260.0,8188.0,9363.0,6839.0
$35-The Aluminum Group,22817.0,22813.0,23031.0,22409.0
'94 Abyss-Shitmat,13771.0,13758.0,13970.0,14381.0
'Dozer Rage-Latterman,13344.0,13289.0,11908.0,13010.0
...,...,...,...,...
É Papa Ré-Santana,1167.0,1115.0,1343.0,957.0
Émigré-James Dean Bradfield,4758.0,4666.0,4385.0,3835.0
Ñapaes-Ska-P,3812.0,3728.0,3490.0,4598.0
Översättning-Ayo,9262.0,9412.0,9045.0,9603.0


In [44]:
scores = scoring_in_out(df_hidden_sample, reco_kmbknn)

In [45]:
scores.reco_in.value_counts()


0    164
1      9
2      1
Name: reco_in, dtype: int64

In [93]:
hist = df_apparent_sample.loc[df_apparent_sample.user==sample[0]]

In [94]:
hist_feat = X.loc[X.index.isin(df_apparent_sample.loc[df_apparent_sample.user==sample[0]].track_id)]

In [99]:
labels, centroids, _ = get_listenings_history_partition(hist_feat, random_state = 1234)

In [100]:
labels

array([4, 1, 1, 1, 0, 4, 3, 0, 4, 4, 1, 4, 4, 2, 4, 4, 4, 1, 0, 3, 0, 0,
       0, 0, 0, 4, 2, 1, 4, 1, 0, 0, 4, 0, 1, 1, 4, 2, 4, 1, 4, 2, 4, 3,
       4, 1, 1, 0, 0, 0, 0, 2, 4, 2, 0, 1, 4, 4, 1, 3, 2, 2, 4, 2, 2, 4,
       2, 4, 4, 3, 2, 4, 2, 4, 3, 3, 0, 1, 4, 0, 2, 1, 0, 0, 0, 1])

In [101]:
D = pd.DataFrame(distance_matrix(X, centroids), index = X.index)

In [14]:
R = D.rank(axis = 0)

NameError: name 'D' is not defined

1967312                     Red Right Ankle-The Decemberists
3155438                                    Haze of Love-Cake
1916274                        Interlude (Milo)-Modest Mouse
1901878              Never Ending Math Equation-Modest Mouse
303118           The Bachelor and the Bride-The Decemberists
2307506                   Bombtrack-Rage Against The Machine
990753                               Pretty Pink Ribbon-Cake
191470                                  Race Car Ya-Yas-Cake
671531                                      Never There-Cake
286457                                Twilight Galaxy-Metric
3822274                                Take It All Away-Cake
2737154             The Train That Stole My Man-Two Gallants
3199826                     Where Eagles Dare-Sloppy Seconds
394854                             Your Touch-The Black Keys
862162          Killing In The Name-Rage Against The Machine
1404489                         Leavin' Trunk-The Black Keys
985515                  

In [18]:
print(R.loc[R.index.isin(df_hidden_sample.loc[df_hidden_sample.user==sample[0]].track_id),])

NameError: name 'R' is not defined

In [40]:
def kmeans_based_ranking(listenings_history, X, weighted = True, n_clusters = 'auto', random_state = None):  
    # Get the features of the songs listened by the user
    listenings_history_feats = X.loc[listenings_history.track_id]      
    
    # If a weighting of each tracks is desired :
    if weighted:
        # Calculation of the weight of each track according to its number of listens
        w = listenings_history.listening_count / listenings_history.listening_count.sum()
        w.index = listenings_history_feats.index 
        # Tracks weighting
        listenings_history_feats = listenings_history_feats.apply(lambda x: x*w)
    
    # Compute the user tracks centroids
    if n_clusters == 'auto':
        labels, centroids, _ = get_listenings_history_partition(listenings_history_feats)
    elif n_clusters == 1:
        labels = np.zeros(len(listenings_history))
        centroids = listenings_history_feats.apply('mean')
    else:
        clf = KMeans(n_clusters = n_clusters, n_init = 'auto', random_state = random_state)
        clf.fit(listenings_history_feats)
        centroids = clf.cluster_centers_
        labels = clf.labels_
    
    # Define the number of neighbors to find according to the clusters size
    # cluster_size = pd.Series(labels).value_counts()
    # nb_tracks_by_clusters = [round(n_neighbors * v) for v in cluster_size / sum(cluster_size)]
    # if not sum(nb_tracks_by_clusters) == n_neighbors:
    #     nb_tracks_by_clusters[-1] = n_neighbors - sum(nb_tracks_by_clusters[:-1])
        
    # Compute the distnce between the tracks and the centroids  
    D = pd.DataFrame(distance_matrix(X.loc[X.index.difference(listenings_history_feats.index)], centroids), index = X.index.difference(listenings_history_feats.index))           

#     # Get the ranks of the tracks relating to its distance with each centroid
    R = D.rank(axis = 0)

#     # Get the n_neighbors unique recommended tracks
#     recommended_tracks = []
#     for i, n in enumerate(nb_tracks_by_clusters):
#         tracks = [t for t in list(R.iloc[:,i].sort_values().index) if not t in recommended_tracks]
#         recommended_tracks = recommended_tracks + tracks[:n]

    return R, labels, centroids