In [1]:
import pandas as pd
import numpy as np

In [2]:
plays = pd.read_csv('Datas/lastfm/user_artists.dat', sep='\t') ## Dataset of relation between an artist 
## an user and the number listening 
artists = pd.read_csv('Datas/lastfm/artists.dat', sep='\t', usecols=['id','name']) ## id	name	url	pictureURL

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})
#print(ap)
# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
print(artist_rank)

                    totalUsers  totalPlays     avgPlays
name                                                   
Britney Spears             522     2393140  4584.559387
Depeche Mode               282     1301308  4614.567376
Lady Gaga                  611     1291387  2113.563011
Christina Aguilera         407     1058405  2600.503686
Paramore                   399      963449  2414.659148
...                        ...         ...          ...
Morris                       1           1     1.000000
Eddie Kendricks              1           1     1.000000
Excess Pressure              1           1     1.000000
My Mine                      1           1     1.000000
A.M. Architect               1           1     1.000000

[17632 rows x 3 columns]


In [3]:
# Merge into ap matrix

ap = ap.join(artist_rank, on="name", how="inner") \
    .sort_values(['playCount'], ascending=False)
#print(ap)

# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)
#print(ap)

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values
#print(ratings)

# Show sparsity
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
print("density: %.2f %%" % sparsity)

density: 0.28 %


In [4]:
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
'''selon le type de matrice sparse, la compresion n est pas la même'''

# Build a sparse matrix
X = csr_matrix(ratings)
#print(X)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

rating matrix shape (1892, 17632)


In [5]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset

# Build data references + train test
Xcoo = X.tocoo()
# initialise le dataset du type (triplet)
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
train, test = random_train_test_split(interactions)

# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

# To be completed...

In [6]:
# Train
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f5c471bb8b0>

In [7]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.37, test 0.13.
AUC: train 0.96, test 0.86.


In [8]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Depeche Mode' 'The Killers' 'Coldplay' ... 'HAKUEI' 'Nano' 'Hi:BRiD']


## Choix de la meilleur fonction de coût

In [9]:
import time
from collections import defaultdict

loss_val = ['logistic', 'bpr', 'warp', 'warp-kos']

def append_res_list(res, name, time, prec, recall, AUC):
    res[name].append([
                time,
                prec,  
                recall,           
                AUC,           
            ])
    return res

train_res = defaultdict(list)
test_res = defaultdict(list)
for loss in loss_val:
    name = 'loss-'+loss
    
    model = LightFM(learning_rate=0.05, loss=loss)
    t0 = time.time()
    model.fit(train, epochs=10, num_threads=2)
    fit_time = time.time() - t0
    
    ## precision
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()
    
    ## recal
    train_recall = recall_at_k(model, train, k=10).mean()
    test_recall = recall_at_k(model, test, k=10, train_interactions=train).mean()
    
    ## AUC
    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test, train_interactions=train).mean()
    
    ## resultats summury
    
    train_res = append_res_list(train_res, name, fit_time, 
                         train_precision, train_recall, train_auc)
    

    test_res = append_res_list(test_res, name, fit_time, 
                         test_precision, test_recall, test_auc)

In [10]:
def print_df_resultats(res):
    # Compute mean and std
    final = {}
    for model in res:
        arr = np.array(res[model])
        final[model] = {
            "time (s)" : arr[:,0].mean().round(3),
            "precision ": arr[:,1].mean().round(3),
            "recall": arr[:,2].mean().round(3),
            "AUC": arr[:,3].mean().round(3),
        }

    df = pd.DataFrame.from_dict(final, orient="index").round(3)
    return df

In [11]:
df_train = print_df_resultats(train_res)
df_train

Unnamed: 0,time (s),precision,recall,AUC
loss-logistic,0.405,0.196,0.051,0.888
loss-bpr,0.509,0.369,0.095,0.855
loss-warp,0.557,0.375,0.097,0.964
loss-warp-kos,1.156,0.326,0.084,0.887


In [12]:
df_test = print_df_resultats(test_res)
df_test

Unnamed: 0,time (s),precision,recall,AUC
loss-logistic,0.405,0.065,0.067,0.807
loss-bpr,0.509,0.124,0.128,0.78
loss-warp,0.557,0.13,0.133,0.854
loss-warp-kos,1.156,0.115,0.118,0.815


Le meilleur modèle terme de précision, recall et AUC est lorsque l'on choisi la fonction loss = 'wrap'. C'est vrai pour le training et le testing set.

## Test d'optimisation d'hyper-paramètres avec GreadSearchCV

Je ne suis pas sûre que ce soit compatible avec les modèles lightfm mais le jeu en vaut la chandèle pour pouvoir optimiser tous les paramètre avec une grille de recherche.

In [13]:
## affichage des paramètre 
LightFM().get_params().keys()

dict_keys(['loss', 'learning_schedule', 'no_components', 'learning_rate', 'k', 'n', 'rho', 'epsilon', 'max_sampled', 'item_alpha', 'user_alpha', 'random_state'])

In [14]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

AUC_scorer = make_scorer(auc_score)

grid_params = {
    'loss' : ['logistic', 'bpr', 'warp', 'warp-kos'],
    'no_components' : [1, 2, 5, 10, 20],
    'learning_rate' : [0.1, 5e-2, 1e-2, 1e-3],
    'k' : [1, 2, 5, 10, 20],
} 

gs = GridSearchCV(LightFM(),
                  grid_params,
                  cv = 5,
                  scoring=AUC_scorer,
                  n_jobs = -1
                 )

In [15]:
gs_res = gs.fit(train)

TypeError: _score() missing 1 required positional argument: 'y_true'

## Fonction de recommandation
On va entreiner le modèle avec fonction loss = 'warp'

In [16]:
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7f5c471bb520>

In [17]:
def get_recommandation(user, model=model):
    scores = model.predict(user, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    print('Liste des 5 artistes recommandés :', top_items[:10])
    return top_items

user = 1
top_items = get_recommandation(user)

Liste des 5 artistes recommandés : ['Bonobo' 'Apparat' 'Amethystium' 'Autechre' 'Solar Fields' 'Ochre'
 'Boards of Canada' 'múm' 'Four Tet' 'Pleq']
