In [107]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipynb
from scipy.spatial import distance_matrix
import random
import sys as sys
import time
import re

In [34]:
from ipynb.fs.full.datamanagement import user_filter, summarise_listening_history
from ipynb.fs.full.evaluation_workflow import split_history, scoring_accuracy_vs_serendipity
from ipynb.fs.full.feature_based_algos import get_listenings_history_partition, kmeans_based_knn


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.spatial import distance_matrix

ImportError: cannot import name 'scoring_accuracy_vs_serendipity' from 'ipynb.fs.full.evaluation_workflow' (unknown location)

## Data loading

In [3]:
# Read the data
CUR_DIR = os.path.abspath('')

DATA_DIR = Path(CUR_DIR).parent / "data"
FILE_PATH = DATA_DIR/'triplets_metadata_spotify.csv'
df = pd.read_csv(FILE_PATH, index_col = 0)

In [4]:
df['track_id'] = df.song_name + '-' + df.artist_name

In [5]:
len(df)

4982520

## Data management

In [6]:
## keep first occurence of a given song/artist instance to reduce redundant observations

df = df.drop_duplicates(subset = ['user','track_id'], keep='first')
len(df)

4973744

### Users filtering

In [9]:
users_summary = summarise_listening_history(df)

In [10]:
ids = user_filter(users_summary, nlist_min = 1, nlist_max = 500, ntracks_min = 5)

In [85]:
wdf = df.loc[df['user'].isin(ids),:]
wdf_users_summary = users_summary.loc[users_summary['user'].isin(ids),:]

In [12]:
print('Nb of ligns in df  :', len(df))
print('Nb of ligns in wdf :', len(wdf))

Nb of ligns in df  : 4973744
Nb of ligns in wdf : 3856377


In [13]:
len(wdf_users_summary)

327081

In [14]:
wdf_users_summary.head()

Unnamed: 0,user,listening_count,track_count
7,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10
11,0000bb531aaa657c932988bc2f7fd7fc1b2050ec,14,10
12,0000d3c803e068cf1da17724f1674897b2dd7130,7,5
14,0000f88f8d76a238c251450913b0d070e4a77d19,30,8
15,000138e252eea35fd73aaf66a9b34102b695a9c8,26,13


### Songs management

In [15]:
# Get the songs quantitative features

tracks_feats = df.drop(['user', 'listening_count'],axis = 1)
tracks_feats['track_id'] = df.song_name + '-' + df.artist_name
tracks_quanti_feats = tracks_feats.drop(['song_name', 'release', 'artist_name',
        'artist_familiarity', 'artist_hotttnesss', 'year', 'key',
       'shs_perf', 'shs_work', 'explicit', 'mode', 'time_signature', 'release_date', 'duration', 'tempo'], axis = 1).drop_duplicates()

In [16]:
# Normalize the songs quantitative feats
X = tracks_quanti_feats.drop(['track_id'], axis = 1)
X.index = tracks_quanti_feats.track_id

scaler = StandardScaler()
X[X.columns] = pd.DataFrame(scaler.fit_transform(X), index=X.index)


## Recommendation algorithms evaluation

### Split the data into hidden and apparent sets

In [97]:
wdf_sorted = wdf.sort_values('user')
wdf_users_summary_sorted = wdf_users_summary.sort_values('user')

In [21]:
# Try 1; Triplets non sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 1:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 1: 42.95786738395691 seconds


In [22]:
# Try 2: Triplets non sorted by users, with users_summary
st = time.time()
test1, test2 = split_history(wdf, 0.2, sort = True, users_summary = wdf_users_summary, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 2:', elapsed_time, 'seconds')

Sorting the triplets by users id ... Done
Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 36.215386390686035 seconds


In [23]:
# Try 3: Triplets sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 3:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 2: 21.979166746139526 seconds


In [18]:
# Try 4: Triplets sorted by users, without users_summary
st = time.time()
test1, test2 = split_history(wdf_sorted, 0.2, sort = False, users_summary = wdf_users_summary_sorted, random_state = 1234)
et = time.time()
elapsed_time = et - st
print('Execution time 4:', elapsed_time, 'seconds')


Computing the number of tracks listened by the users ... Done
Defining hidden tracks ids ... Done
Defining apparent tracks ids ... Done
Splitting the listening history ...Execution time 4: 17.047485828399658 seconds


### Definition of the grid of parameters

In [44]:
n_neighbors = [10, 25, 50, 100, 150, 200]

### Initialize the table of results  

In [130]:
# results = pd.DataFrame({
#     'user' : wdf_users_summary_sorted.user,
#     'listening_count' : wdf_users_summary_sorted.listening_count,
#     'track_count' :  wdf_users_summary_sorted.track_count
# })

# for r in ['n_reco-'+ str(n)+ '-' + i  for n in n_neighbors for i in ("reco_in","reco_out")]:
#     results[r] = None
# results = results.melt(id_vars = ['user', 'listening_count', 'track_count']).sort_values('user')
# results['n_reco'] =  results.variable.str.extract(r'([0-9]+)').astype('int')
# results['reco_type'] =  results.variable.str.extract(r'(reco_.+)')
# results = results.drop(['variable'], axis = 1)
# results.to_csv('../data/reco_res.csv')

In [143]:
results = pd.read_csv('../data/reco_res.csv')

In [144]:
results.head()

Unnamed: 0.1,Unnamed: 0,user,listening_count,track_count,value,n_reco,reco_type
0,0,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10,,10,reco_in
1,654162,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10,,25,reco_in
2,1308324,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10,,50,reco_in
3,3597891,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10,,200,reco_out
4,1635405,00007ed2509128dcdd74ea3aac2363e24e9dc06b,11,10,,50,reco_out


### Random recommandations

In [145]:
def random_k_reco(triplets, list_tracks, k):
    users = triplets.user.unique()
    reco = np.concatenate([random.sample(list(set(list_tracks) - set(triplets.loc[triplets.user == i, :].track_id)), k) for i in users])
    return pd.DataFrame({'user' : [i for i in users for j in range(k)],'track_id' : reco})

In [262]:
random.seed = 1234
sample = random.sample(list(test1.user.unique()), 500)

In [263]:
df_apparent_sample = test1.loc[test1.user.isin(sample),: ]
df_hidden_sample = test2.loc[test2.user.isin(sample),: ]

In [264]:
len(df_hidden_sample.user.unique())

500

In [265]:
reco = random_k_reco(df_apparent_sample, X.index, n_neighbors[3])

In [267]:
scores = scoring_in_out(df_hidden_sample, reco)

In [269]:
scores.reco_in.value_counts()

0    496
1      4
Name: reco_in, dtype: int64

In [171]:
scores = scoring_in_out(df_hidden_sample, reco)
scores['algo'] = 'random'
scores['seed'] = 1234

In [173]:
results = wdf_users_summary_sorted.merge(scores, how = 'inner', on = ['user'])

In [261]:
results.reco_in.value_counts()

0    998
1      2
Name: reco_in, dtype: int64

In [166]:
 results.to_csv('../data/reco_res.csv')

## Work in progress

In [270]:
reco_kmbknn = pd.DataFrame(
    {i : kmeans_based_knn(
        listenings_history = df_apparent_sample.loc[df_apparent_sample.user == i],
        X = X,
        n_neighbors = n_neighbors[3],
        random_state = 1234) for i in sample}
    ).melt(var_name = 'user', value_name = 'track_id')

In [271]:
scores = scoring_in_out(df_hidden_sample, reco_kmbknn)

In [272]:
scores.reco_in.value_counts()


0    495
1      5
Name: reco_in, dtype: int64

In [234]:
scores

Unnamed: 0,user,n_hidden,n_reco,reco_in,reco_out
0,db43a6fc4bd252b85938fc024f5691c46f11b475,100,100,100,0
1,d1d35f195219f8d4b7b73601d14119cde3407bf2,100,100,100,0
2,7c6e809a1d63849b45bc68b014370929b23a774b,100,100,100,0
3,b38fcb307c695261eb96d7b0e8ab1aaa8f6f3e38,100,100,100,0
4,2b1617c9753e283e0bd262e8c8fb2873fbfbd3a0,100,100,100,0
...,...,...,...,...,...
95,3e45cf7b26d97c63be6c5abc0cd135d1020aa52a,100,100,100,0
96,e07bfe0d565c58360251a7db6f28df14a32e1113,100,100,100,0
97,c4cd8da5275e778d2c9f2fe697369a8779eb39fc,100,100,100,0
98,813ad50b6c25731e25c530d381bc9dad2b31a6b1,100,100,100,0
