In [1]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipynb
from scipy.spatial import distance_matrix
import random
import sys as sys
import time

In [209]:
from ipynb.fs.full.datamanagement import user_filter
from ipynb.fs.full.datamanagement import summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history
from ipynb.fs.full.feature_based_algos import get_listenings_history_partition, kmeans_based_knn


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

## Data loading

In [3]:
# Load the triplets/metadata CSV from the "data" folder that sits next to the
# current working directory.
CUR_DIR = os.path.abspath('')

DATA_DIR = Path(CUR_DIR).parent.joinpath("data")
FILE_PATH = DATA_DIR.joinpath('triplets_metadata_spotify.csv')
# The first CSV column becomes the row index.
df = pd.read_csv(FILE_PATH, index_col=0)

In [4]:
# Composite track key of the form "<song_name>-<artist_name>".
df['track_id'] = df['song_name'] + '-' + df['artist_name']

In [5]:
# Number of rows currently held in df.
df.shape[0]

4982520

## Data management

In [5]:
# A (user, track_id) pair may occur on several rows; keep only the first
# occurrence of each pair to reduce redundant observations.
is_repeat = df.duplicated(subset=['user', 'track_id'], keep='first')
df = df[~is_repeat]
df.shape[0]

4973744

### Users filtering

In [6]:
# Per-user summary produced by the project helper; the head() displayed further
# down shows the columns user, listening_count and track_count.
users_summary = summarise_listening_history(df)

In [7]:
# Ids of the users to keep.
# NOTE(review): presumably nlist_min/nlist_max bound the per-user listening
# count and ntracks_min the number of distinct tracks — confirm against
# user_filter in the datamanagement notebook.
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 500, ntracks_min = 5)

In [8]:
# Restrict both the triplets and the per-user summary to the retained users.
wdf = df[df['user'].isin(ids)]
wdf_users_summary = users_summary[users_summary['user'].isin(ids)]

In [9]:
# Row counts before (df) and after (wdf) the user filter.
print('Nb of ligns in df  :', df.shape[0])
print('Nb of ligns in wdf :', wdf.shape[0])

Nb of ligns in df  : 4973744
Nb of ligns in wdf : 3856377


In [10]:
# Number of rows in the filtered per-user summary.
wdf_users_summary.shape[0]

327081

In [11]:
# Preview the first rows of the filtered per-user summary.
wdf_users_summary.head()

Unnamed: 0,user,listening_count,track_count
7,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10
11,0000bb531aaa657c932988bc2f7fd7fc1b2050ec,14,10
12,0000d3c803e068cf1da17724f1674897b2dd7130,7,5
14,0000f88f8d76a238c251450913b0d070e4a77d19,30,8
15,000138e252eea35fd73aaf66a9b34102b695a9c8,26,13


### Songs management

In [12]:
# Build the table of quantitative track features.

# Start from the triplets without the per-user columns.
tracks_feats = df.drop(columns=['user', 'listening_count'])
tracks_feats['track_id'] = df['song_name'] + '-' + df['artist_name']

# Remove the columns that are not kept as features, then collapse identical rows.
tracks_quanti_feats = (
    tracks_feats
    .drop(columns=[
        'song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
        'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature',
        'release_date', 'duration', 'tempo',
    ])
    .drop_duplicates()
)

In [13]:
# Standardise the quantitative track features; rows are indexed by track_id.
X = tracks_quanti_feats.set_index('track_id')

scaler = StandardScaler()
# Rebuild the frame around the scaled values, keeping the same row and column labels.
X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)


## Recommendation algorithms evaluation

### Split the data into hidden and apparent sets

In [14]:
# User-sorted copies, used by the split_history calls that pass sort = False.
wdf_sorted = wdf.sort_values(by='user')
wdf_users_summary_sorted = wdf_users_summary.sort_values(by='user')

In [21]:
# Try 1: triplets not sorted by user, without users_summary.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which the system may adjust mid-run.
st = time.perf_counter()
test1, test2 = split_history(wdf, 0.2, sort = True, random_state = 1234)
et = time.perf_counter()
elapsed_time = et - st
print('Execution time 1:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 1: 42.95786738395691 seconds


In [22]:
# Try 2: triplets not sorted by user, with users_summary.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which the system may adjust mid-run.
st = time.perf_counter()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.perf_counter()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 36.215386390686035 seconds


In [23]:
# Try 3: triplets sorted by user, without users_summary.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which the system may adjust mid-run.
st = time.perf_counter()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, random_state = 1234)
et = time.perf_counter()
elapsed_time = et - st
print('Execution time 3:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 21.979166746139526 seconds


In [15]:
# Try 4: triplets sorted by user, with the (sorted) users_summary.
# perf_counter() is a monotonic clock intended for measuring elapsed intervals;
# time.time() follows the wall clock, which the system may adjust mid-run.
st = time.perf_counter()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, users_summary = wdf_users_summary_sorted, random_state = 1234)
et = time.perf_counter()
elapsed_time = et - st
print('Execution time 4:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 4: 9.531891107559204 seconds


### Definition of the grid of parameters

### Random recommendations

In [18]:
def random_k_reco(triplets, list_tracks, k):
    """Recommend k random, not-yet-listened tracks to every user.

    Parameters
    ----------
    triplets : pandas.DataFrame
        Listening history with at least the columns ``user`` and ``track_id``.
    list_tracks : iterable
        Candidate track ids to draw recommendations from.
    k : int
        Number of tracks recommended to each user.

    Returns
    -------
    pandas.DataFrame
        Columns ``user`` and ``track_id``, with k consecutive rows per user.
        Users appear in order of first appearance in ``triplets``. The frame
        is empty when ``triplets`` contains no user.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a user has fewer than k candidate
        tracks left after removing the tracks already listened to.
    """
    # Built once: the catalogue does not depend on the user.
    catalogue = set(list_tracks)

    users, tracks = [], []
    # A single grouped pass replaces one full-frame boolean scan per user.
    for user, listened in triplets.groupby('user', sort=False)['track_id']:
        candidates = list(catalogue - set(listened))
        tracks.extend(random.sample(candidates, k))
        users.extend([user] * k)

    return pd.DataFrame({'user': users, 'track_id': tracks})

In [21]:
# Draw 1000 users for the evaluation. A dedicated, seeded generator makes the
# draw reproducible across kernel restarts without touching the global RNG state.
sample = random.Random(1234).sample(list(test1.user.unique()), 1000)

['9c6816187c7a22b9823256336f735a1d60aeb1e2',
 'da5bfabbcd52dd210f7c503c2d1950d1b86952bc',
 '106e6363fad53f217ccc7f4285032366e47f8752',
 '5574e47ddf35df140ed00b15334b6b6e6b9a1b24',
 'a5a44e66d1293f3d218f04e4c23be75a422bb77e',
 '5a92d6ead4e6ab8d0efc241a5d0c0409c3cfc494',
 '0bb3843d20c7d5fdfe40f9457b79babe5f56b811',
 '76827f9241949507fbcda65ca9cd00fc95c0b3ba',
 'bd8b85eda3bfdc48dddfd0ee1e69c6087d3f62e8',
 'e13e38e5c730f32fdcc15440c52e84d52e964741']

In [22]:
# Rows of the two split outputs that belong to the sampled users.
df_apparent_sample = test1[test1.user.isin(sample)]
df_hidden_sample = test2[test2.user.isin(sample)]

In [23]:
# 10 random recommendations per sampled user, drawn from the track ids indexing X.
reco = random_k_reco(df_apparent_sample, X.index, 10)

In [24]:
# Display the recommendation table (one row per user/track pair).
reco

Unnamed: 0,user,track_id
0,00096d38be8c339e7f0b74d52f4653eb8b31fbdc,CP24-Woodhands
1,00096d38be8c339e7f0b74d52f4653eb8b31fbdc,Flowing-Teenage Fanclub
2,00096d38be8c339e7f0b74d52f4653eb8b31fbdc,Drifter-Probspot
3,00096d38be8c339e7f0b74d52f4653eb8b31fbdc,I Let Go-Eighteen Visions
4,00096d38be8c339e7f0b74d52f4653eb8b31fbdc,New Patches-Mel Tillis
...,...,...
9995,ffbba5704849e25b74cb08435c5f3bf276a8df93,Who Can I Say You Are-Morly Grey
9996,ffbba5704849e25b74cb08435c5f3bf276a8df93,Stutter-Lake Trout
9997,ffbba5704849e25b74cb08435c5f3bf276a8df93,Prime Mover-Steve Stevens
9998,ffbba5704849e25b74cb08435c5f3bf276a8df93,Come Together-Third Day


In [130]:
def scoring_accuracy_vs_serendipity(hidden_triplets, recommended_triplets):
    """Score recommendations against each user's hidden tracks.

    For every user present in ``hidden_triplets`` (in order of first
    appearance), two ratios are computed over sets of ``track_id``:

    * accuracy: share of the user's hidden tracks that were recommended,
      ``1 - |hidden - recommended| / |hidden|``;
    * serendipity: share of the user's recommended tracks that are not in the
      hidden set, ``|recommended - hidden| / |recommended|``.

    Parameters
    ----------
    hidden_triplets : pandas.DataFrame
        Hidden listening history, with columns ``user`` and ``track_id``.
    recommended_triplets : pandas.DataFrame
        Recommendations, with columns ``user`` and ``track_id``.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(accuracy, serendipity)``, one value per hidden user. A user without
        any recommendation gets accuracy 0.0 and serendipity NaN, since the
        ratio is undefined (this case used to raise ZeroDivisionError).
    """
    def _tracks_by_user(triplets):
        # One grouped pass per frame instead of a boolean scan per user.
        grouped = triplets.groupby('user', sort=False)['track_id']
        return {user: set(tracks) for user, tracks in grouped}

    hidden_by_user = _tracks_by_user(hidden_triplets)
    recommended_by_user = _tracks_by_user(recommended_triplets)

    accuracy, serendipity = [], []
    for user, hidden in hidden_by_user.items():
        recommended = recommended_by_user.get(user, set())
        accuracy.append(1 - len(hidden - recommended) / len(hidden))
        if recommended:
            serendipity.append(len(recommended - hidden) / len(recommended))
        else:
            # No recommendation for this user: the ratio is undefined.
            serendipity.append(float('nan'))
    return accuracy, serendipity

In [131]:
# Per-user accuracy and serendipity of the random recommendations, measured
# against the hidden histories of the sampled users.
acc, ser = scoring_accuracy_vs_serendipity(df_hidden_sample, reco)

In [139]:
# Summary statistics of the per-user accuracy values.
pd.Series(acc).describe()

count    1000.000000
mean        0.033309
std         0.140897
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
dtype: float64

## Work in progress

In [203]:
# Rows of test1 for a single sampled user (the one at position 100 of sample).
listenings_history = test1[test1.user == sample[100]]

In [204]:
# Display that user's listening history.
listenings_history

Unnamed: 0,user,listening_count,song_name,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,shs_perf,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,release_date,track_id
373736,db6ca1554d54eb9589ee367368053a3429c697b3,1,Take It In,One Life Stand,Hot Chip,249.65179,0.852639,0.535763,2009,-1,...,1,0.0349,0.0351,0.0976,0.121,0.39,120.114,4.0,2010-02-01,Take It In-Hot Chip
502060,db6ca1554d54eb9589ee367368053a3429c697b3,1,Plasticities,Live at Austin City Limits Music Festival 2007...,Andrew Bird,428.95628,0.757952,0.503813,2007,-1,...,1,0.0288,0.463,0.0124,0.114,0.678,146.838,4.0,2007,Plasticities-Andrew Bird
445965,db6ca1554d54eb9589ee367368053a3429c697b3,1,Someone Great,Someone Great,LCD Soundsystem,389.48526,0.764882,0.614346,2007,-1,...,1,0.0325,0.0418,0.00374,0.11,0.396,113.003,4.0,2019-06-07,Someone Great-LCD Soundsystem
4177345,db6ca1554d54eb9589ee367368053a3429c697b3,1,Love Spreads,Second Coming,The Stone Roses,347.14077,0.725522,0.478417,1994,-1,...,1,0.0481,0.00375,0.0029,0.585,0.553,92.367,4.0,2002-11-04,Love Spreads-The Stone Roses
1070303,db6ca1554d54eb9589ee367368053a3429c697b3,1,The Laws Have Changed,Electric Version,The New Pornographers,206.81098,0.708589,0.523546,2003,-1,...,1,0.0385,0.00342,0.0,0.0811,0.688,141.943,4.0,2004-10-12,The Laws Have Changed-The New Pornographers
549310,db6ca1554d54eb9589ee367368053a3429c697b3,1,Heretics,Heretics,Andrew Bird,215.87546,0.757952,0.498424,2007,-1,...,1,0.0546,0.114,3e-05,0.117,0.496,131.697,4.0,2007,Heretics-Andrew Bird


In [211]:
# Run the k-means based KNN recommender on the single-user history built above.
# The history is passed explicitly: the `user_test` name used before is not
# defined in any visible cell, so the call failed on a fresh kernel.
kmeans_based_knn(listenings_history = listenings_history, X = X, n_neighbors = 10, random_state = 1234)

['I Am Destined For Greatness-We Versus The Shark',
 'Recovery-New Buffalo',
 'Arts Centre-The Chap',
 'Shambala-B.W. Stevenson',
 'Chicken Shack-Pinetop Perkins',
 'When I Meet Them-Seals and Crofts',
 'Three Hits-Indigo Girls',
 'Tell Him-Benton Falls',
 'The Race Is On Again-Yo La Tengo',
 'So Fine-Electric Light Orchestra']