# Bayesian Personalized Ranking

Il s'agit d'un algorithme implicite, c'est une approche qui donne les 'rankings' pour un jeu d'items pour un utilisateur spécifique.

BPR s'intéresse au triplet (u, i, j) avec ***u*** un utilisateur, ***i*** un item connu et ***j*** un item inconnu.

BPR utilise une formule bayésienne pour maximiser la probabilité postérieure. (Formule de Bayes)

---

Le critère d'optimisation final est le suivant :


![Texte alternatif…](https://drive.google.com/uc?id=1EJ5cSOEoKdlaCWg0SM0OEhvs7yH9LeGg)



### Modèle Tensorflow

In [89]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import pandas as pd
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm

Preparation des données

In [90]:
df = pd.read_csv('Datasets/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv', sep='\t')

In [91]:
df.head()

Unnamed: 0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691


In [92]:
df.drop(df.columns[1], axis=1, inplace=True)

In [93]:
df.columns = ['user', 'artist', 'plays']

In [94]:
df.dropna(inplace=True)

In [95]:
df['user_id'] = df['user'].astype('category').cat.codes
df['artist_id'] = df['artist'].astype("category").cat.codes

In [96]:
item_lookup = df[['artist_id', 'artist']].drop_duplicates()
item_lookup['artist_id'] = item_lookup["artist_id"].astype(str)

In [97]:
df.drop(['user', 'artist'], axis=1, inplace=True)

In [98]:
df = df.loc[df.plays != 0]

In [99]:
users = list(np.sort(df.user_id.unique()))
artists = list(np.sort(df.artist_id.unique()))
plays = list(df.plays)

In [100]:
rows = df.user_id.astype(float)
cols = df.artist_id.astype(float)

In [101]:
data_sparse = sp.csr_matrix((plays, (rows, cols)), shape=(len(users), len(artists)))

In [102]:
sparsity = 1 - data_sparse.nnz / (data_sparse.shape[0] * data_sparse.shape[1])
print(f'Sparsity : {sparsity*100:0.3f} %')

Sparsity : 0.017 %


In [103]:
uids, iids = data_sparse.nonzero()

Hyperparameters

In [104]:
epochs = 50
batches = 30
num_factors = 64 # Latent factors

# Regularization parameters
lambda_user = 0.0000001
lambda_item = 0.0000001
lambda_bias = 0.0000001

# Learning rate
lr = 0.005

# Triplets by batch
samples = 15000

Tensorflow Graph

In [105]:
graph = tf.Graph()

In [106]:
def init_variable(size, dim, name=None):
    '''
    Helper function to init a new variable with uniform random valuers
    '''
    std = np.sqrt(2/dim)
    return tf.Variable(tf.random_uniform([size, dim], -std, std), name=name)

In [107]:
def embed(inputs, size, dim, name=None):
    '''
    Helper function to get a Tensorflow variable and create an embedding lookup 
    in order to map our user and item indices to vector
    '''
    emb = init_variable(size, dim, name)
    return tf.nn.embedding_lookup(emb, inputs)

In [108]:
def get_variable(graph, session, name):
    '''
    Helper function to get the value of Tensorflow variable by name
    '''
    v = graph.get_operation_by_name(name)
    v = v.values()[0]
    v = v.eval(session=session)
    
    return v

In [138]:
with graph.as_default():
    '''
    Loss function: 
    -SUM ln σ(xui - xuj) + λ(w1)**2 + λ(w2)**2 + λ(w3)**2 ...
    ln = the natural log
    σ(xuij) = the sigmoid function of xuij.
    λ = lambda regularization value.
    ||W||**2 = the squared L2 norm of our model parameters.

    '''

    # Input into our model, in this case our user (u),
    # known item (i) an unknown item (i) triplets.
    u = tf.placeholder(tf.int32, shape=(None, 1))
    i = tf.placeholder(tf.int32, shape=(None, 1))
    j = tf.placeholder(tf.int32, shape=(None, 1))

    # User feature embedding
    u_factors = embed(u, len(users), num_factors, 'user_factors') # U matrix

    # Known and unknown item embeddings
    item_factors = init_variable(len(artists), num_factors, "item_factors") # V matrix
    i_factors = tf.nn.embedding_lookup(item_factors, i)
    j_factors = tf.nn.embedding_lookup(item_factors, j)

    # i and j bias embeddings.
    item_bias = init_variable(len(artists), 1, "item_bias")
    i_bias = tf.nn.embedding_lookup(item_bias, i)
    i_bias = tf.reshape(i_bias, [-1, 1])
    j_bias = tf.nn.embedding_lookup(item_bias, j)
    j_bias = tf.reshape(j_bias, [-1, 1])

    # Calculate the dot product + bias for known and unknown
    # item to get xui and xuj.
    xui = i_bias + tf.reduce_sum(u_factors * i_factors, axis=2)
    xuj = j_bias + tf.reduce_sum(u_factors * j_factors, axis=2)

    # We calculate xuij.
    xuij = xui - xuj

    # Calculate the mean AUC (area under curve).
    # if xuij is greater than 0, that means that 
    # xui is greater than xuj (and thats what we want).
    u_auc = tf.reduce_mean(tf.to_float(xuij > 0))

    # Output the AUC value to tensorboard for monitoring.
    tf.summary.scalar('auc', u_auc)

    # Calculate the squared L2 norm ||W||**2 multiplied by λ.
    l2_norm = tf.add_n([
        lambda_user * tf.reduce_sum(tf.multiply(u_factors, u_factors)),
        lambda_item * tf.reduce_sum(tf.multiply(i_factors, i_factors)),
        lambda_item * tf.reduce_sum(tf.multiply(j_factors, j_factors)),
        lambda_bias * tf.reduce_sum(tf.multiply(i_bias, i_bias)),
        lambda_bias * tf.reduce_sum(tf.multiply(j_bias, j_bias))
        ])

    # Calculate the loss as ||W||**2 - ln σ(Xuij)
    #loss = l2_norm - tf.reduce_mean(tf.log(tf.sigmoid(xuij)))
    loss = -tf.reduce_mean(tf.log(tf.sigmoid(xuij))) + l2_norm

    # Train using the Adam optimizer to minimize 
    # our loss function.
    opt = tf.train.AdamOptimizer(learning_rate=lr)
    step = opt.minimize(loss)

    # Initialize all tensorflow variables.
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()

Graph Execution

In [139]:
# Run the session. 
with tf.Session(graph=graph) as sess:

    # This has noting to do with tensorflow but gives
    # us a nice progress bar for the training.
    # progress = tqdm(total=batches*epochs)
    
    sess.run(init)
    
    for epoch in range(epochs):
        for _ in range(batches):

            # We want to sample one known and one unknown 
            # item for each user. 
            import random
            idx = random.sample(range(len(uids)), samples)

            batch_u = [[uids[idxx]] for idxx in idx]
            batch_i = [[iids[idxx]] for idxx in idx]

            idx = random.sample(range(len(artists)), samples)
            batch_j = [[idxx] for idxx in idx]

            # Feed our users, known and unknown items to
            # our tensorflow graph. 
            feed_dict = { u: batch_u, i: batch_i, j: batch_j }

            # We run the session.
            _, l, auc = sess.run([step, loss, u_auc], feed_dict)

        # progress.update(batches)
        print(' Epoch %d : Loss: %.3f | AUC: %.3f' % (epoch, l, auc))
        
    saver.save(sess, 'models/bpr-recommender-0.1')

# progress.close()

 Epoch 0 : Loss: 0.797 | AUC: 0.530
 Epoch 1 : Loss: 0.750 | AUC: 0.566
 Epoch 2 : Loss: 0.696 | AUC: 0.609
 Epoch 3 : Loss: 0.651 | AUC: 0.635
 Epoch 4 : Loss: 0.606 | AUC: 0.672
 Epoch 5 : Loss: 0.564 | AUC: 0.704
 Epoch 6 : Loss: 0.523 | AUC: 0.734
 Epoch 7 : Loss: 0.480 | AUC: 0.769
 Epoch 8 : Loss: 0.442 | AUC: 0.799
 Epoch 9 : Loss: 0.394 | AUC: 0.836
 Epoch 10 : Loss: 0.357 | AUC: 0.860
 Epoch 11 : Loss: 0.314 | AUC: 0.884
 Epoch 12 : Loss: 0.280 | AUC: 0.900
 Epoch 13 : Loss: 0.246 | AUC: 0.915
 Epoch 14 : Loss: 0.224 | AUC: 0.928
 Epoch 15 : Loss: 0.206 | AUC: 0.931
 Epoch 16 : Loss: 0.181 | AUC: 0.945
 Epoch 17 : Loss: 0.174 | AUC: 0.948
 Epoch 18 : Loss: 0.160 | AUC: 0.952
 Epoch 19 : Loss: 0.149 | AUC: 0.956
 Epoch 20 : Loss: 0.142 | AUC: 0.959
 Epoch 21 : Loss: 0.135 | AUC: 0.961
 Epoch 22 : Loss: 0.132 | AUC: 0.961
 Epoch 23 : Loss: 0.129 | AUC: 0.962
 Epoch 24 : Loss: 0.125 | AUC: 0.962
 Epoch 25 : Loss: 0.118 | AUC: 0.969
 Epoch 26 : Loss: 0.114 | AUC: 0.969
 Epoch 27 :

In [114]:
def find_similar_artists(artist=None, num_items=10):
    """Find artists similar to an artist.
    Args:
        artist (str): The name of the artist we want to find similar artists for
        num_items (int): How many similar artists we want to return.
    Returns:
        similar (pandas.DataFrame): DataFrame with num_items artist names and scores
    """

    # Grab our User matrix U
    user_vecs = get_variable(graph, session, 'user_factors')

    # Grab our Item matrix V
    item_vecs = get_variable(graph, session, 'item_factors')

    # Grab our item bias
    item_bi = get_variable(graph, session, 'item_bias').reshape(-1)

    # Get the item id for Lady GaGa
    item_id = int(item_lookup[item_lookup.artist == artist]['artist_id'])

    # Get the item vector for our item_id and transpose it.
    item_vec = item_vecs[item_id].T

    # Calculate the similarity between Lady GaGa and all other artists
    # by multiplying the item vector with our item_matrix
    scores = np.add(item_vecs.dot(item_vec), item_bi).reshape(1,-1)[0]

    # Get the indices for the top 10 scores
    top_10 = np.argsort(scores)[::-1][:num_items]

    # We then use our lookup table to grab the names of these indices
    # and add it along with its score to a pandas dataframe.
    artists, artist_scores = [], []
    
    for idx in top_10:
        artists.append(item_lookup.artist.loc[item_lookup.artist_id == str(idx)].iloc[0])
        artist_scores.append(scores[idx])

    similar = pd.DataFrame({'artist': artists, 'score': artist_scores})

    return similar

print(find_similar_artists(artist='beyoncé'))

                 artist     score
0       danny fernandes  8.407548
1               beyoncé  8.157237
2                  hoku  7.960122
3                 ciara  7.907829
4         janet jackson  7.778776
5                cassie  7.777456
6  gaiola das popozudas  7.659938
7           brit & alex  7.657467
8             m. pokora  7.651463
9              deepside  7.610708


In [116]:
def make_recommendation(user_id=None, num_items=10):
    """Recommend items for a given user given a trained model
    Args:
        user_id (int): The id of the user we want to create recommendations for.
        num_items (int): How many recommendations we want to return.
    Returns:
        recommendations (pandas.DataFrame): DataFrame with num_items artist names and scores
    """

    # Grab our user matrix U
    user_vecs = get_variable(graph, session, 'user_factors')

    # Grab our item matrix V
    item_vecs = get_variable(graph, session, 'item_factors')

    # Grab our item bias
    item_bi = get_variable(graph, session, 'item_bias').reshape(-1)

    # Calculate the score for our user for all items. 
    rec_vector = np.add(user_vecs[user_id, :].dot(item_vecs.T), item_bi)

    # Grab the indices of the top users
    item_idx = np.argsort(rec_vector)[::-1][:num_items]

    # Map the indices to artist names and add to dataframe along with scores.
    artists, scores = [], []

    for idx in item_idx:
        artists.append(item_lookup.artist.loc[item_lookup.artist_id == str(idx)].iloc[0])
        scores.append(rec_vector[idx])

    recommendations = pd.DataFrame({'artist': artists, 'score': scores})

    return recommendations

print(make_recommendation(user_id=0))

             artist     score
0      mamá ladilla  3.993062
1        les wampas  3.982560
2            eiffel  3.948097
3  os cascavelletes  3.891437
4     oliver onions  3.879301
5             narco  3.836826
6             trust  3.815100
7        f.r. david  3.811432
8        gramofocas  3.794619
9          gigatron  3.759100
