# Recommendations computing

## Import librairies

In [1]:
import os
import pickle

import pandas as pd
import numpy as np

import surprise
from surprise import Reader
from surprise.model_selection import GridSearchCV

from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.append(r"D:\Data\Google Drive\Openclassrooms\P9\Recommenders")
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.recommender.surprise.surprise_utils import compute_ranking_predictions, predict
from reco_utils.evaluation.python_evaluation import *

print("System version: {}".format(sys.version))
print("Surprise version: {}".format(surprise.__version__))

System version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
Surprise version: 1.1.1


## Utils functions

In [27]:
# Functions to compute the content base and collaborative filtering

##############################
# Tools
##############################

def get_user_info(user_id, interactions, user_col, info_col):
    # Retrieve the columns 'info_col' from the DataFrame 'interactions' for a given 'user_id'
    user_items = interactions.loc[interactions[user_col] == user_id, info_col]
    return(user_items.to_numpy())

def get_item_profiles(items_ids, item_profiles):
    # Retrieve 'item_profiles' for the given 'item_ids'
    return(item_profiles[items_ids])

def get_user_profile(user_id, interactions, item_profiles, user_col, item_col, score_col):
    # Compute a user profile as the weighted average of the items the user interacted with.
    # The weights are the scores/ratings of the items the user interacted with.
    user_items = get_user_info(user_id, interactions, user_col, item_col)
    user_scores = get_user_info(user_id, interactions, user_col, score_col)
    user_scores = np.nan_to_num(user_scores)
    user_item_profiles = get_item_profiles(user_items, item_profiles)
    user_profile = np.dot(user_scores, user_item_profiles)
    user_profile = user_profile / user_scores.sum()
    return(user_profile)

##############################
# Content base filtering
##############################
def get_user_content_vectorized(user_ids, nth, interactions_, item_profiles):
    """
    Compute the top nth recommendations for all users in user_ids
    The recommendations are the top n values of the utility matrix.
    The utility matrix providing for each user/item its score as the 
    cosine similarity between the user profile and the item embedding. 
    The user profile is a weighted average of the items seen by the users.
    Weights are the implicit ratings of each item seen.
    
    Inputs:
    - user_ids: list of user ids
    - nth: number of recommendation to provide for each user
    - interactions_: DataFrame of recommendations providing 'userID', 'itemID' and 'rating'
    - item_profiles: numpy array with the items embeddings in columns, line number is the item id

    Outputs:
    - utility_matrix: numpy array
    - top_n: numpy array of the n best recommendations for each users (users in line, recommendations in
    columns)
    - top_n_df: same as top_n but as a DataFrame
    """
    from sklearn.metrics.pairwise import cosine_similarity
    user_profiles = [get_user_profile(i, interactions_, item_profiles,'userID', 'itemID','rating') \
                    for i in user_ids]

    utility_matrix = cosine_similarity(user_profiles, item_profiles)

    top_n = np.argpartition(utility_matrix,kth=-nth,axis=-1)[:,-nth:]
    top_n = np.flip(top_n, axis=-1)
    top_n_df = pd.DataFrame(top_n[:,0:nth], index = user_ids)
    return(utility_matrix, top_n, top_n_df)

def assess_utility_matrix(user_ids, utility_matrix, interactions_):
    """
    Compute the quality of the utility matrix. For each item seen 
    by a user, the function computes the distance between its rating 
    and the best score for that user in the utility matrix.
    
    Inputs:
    - user_ids: list of user ids
    - interactions_: DataFrame of recommendations providing 'userID', 'itemID' and 'rating'
    - utility_matrix: numpy array

    Outputs:
    - dist_from_best
    """
    for user_id in user_ids:
            user_items = interactions_.loc[interactions_["userID"] == user_id, "itemID"].to_numpy()
            user_recos = np.argsort(utility_matrix[user_id,:])
            user_recos = np.flip(user_recos)
            dist_from_best.append(np.argwhere(np.isin(user_recos, user_items)) / len(user_recos) * 100) 
    return(dist_from_best)

##############################
# Collaborative filtering
##############################
def compute_ranking_predictions(
    algo,
    data,
    usercol='userID',
    itemcol='itemID',
    predcol='prediction',
    remove_seen=False,
    k=10,
    user_id=None,
):
    """Computes predictions of an algorithm from Surprise on all users and items in data. It can be used for computing
    ranking metrics like NDCG.
    
    Args:
        algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
        data (pd.DataFrame): the data from which to get the users and items
        usercol (str): name of the user column
        itemcol (str): name of the item column
        remove_seen (bool): flag to remove (user, item) pairs seen in the training data
    
    Returns:
        pd.DataFrame: dataframe with usercol, itemcol, predcol
    """
    preds_lst = []
    if user_id == None:
        users = data[usercol].unique()
    else:
        users = [user_id]
    items = data[itemcol].unique()

    for user in users:
        preds_usr = []
        for item in items:
            # preds_lst.append([user, item, algo.predict(user, item).est])
            preds_usr.append([user, item, algo.predict(user, item).est])
        preds_usr_df = pd.DataFrame(data=preds_usr, columns=[usercol, itemcol, predcol])
        preds_usr_df = preds_usr_df.sort_values(by=predcol, ascending=False)[0:k]
        preds_lst.extend(preds_usr_df.values.tolist())

    all_predictions = pd.DataFrame(data=preds_lst, columns=[usercol, itemcol, predcol])

    if remove_seen:
        tempdf = pd.concat(
            [
                data[[usercol, itemcol]],
                pd.DataFrame(
                    data=np.ones(data.shape[0]), columns=["dummycol"], index=data.index
                ),
            ],
            axis=1,
        )
        merged = pd.merge(tempdf, all_predictions, on=[usercol, itemcol], how="outer")
        return merged[merged["dummycol"].isnull()].drop("dummycol", axis=1)
    else:
        return all_predictions

## Load and preprocess the data
'interactions' were preprocessed during an earlier step (in the notebook 'eda').  
It contains the required information for a recommender models: 'userID', 'itemID' and 'rating'

'rating' was computed (in the notebook 'eda') as the time spent on an article normalized by its length (number of words).

In [3]:
# Import files
data_path = './data'
interactions_file = os.path.join(data_path, 'clicks_light.csv')
item_profiles_file = os.path.join(data_path, 'articles_embeddings.pickle')

interactions = pd.read_csv(interactions_file)

with open(item_profiles_file, mode='rb') as f:
    item_profiles = pickle.load(f)

interactions = interactions.rename(columns={'rating1':'rating'})

In [7]:
interactions.head()

Unnamed: 0,userID,itemID,rating
0,0,157541,1.673888
1,0,68866,
2,0,96755,2.150506
3,0,313996,
4,0,160158,2.304974


**Preprocess the file for collaborative filtering**  
To work efficiently, collaborative filtering requires users to have had some interactions with the system. Consequently, we will apply collaborative filtering only to users with more than 10 interactions.  

We hereunder extract the relevant users interactions hereunder in the 'interactions_' DataFrame.

In [8]:
# Extraction des utilisateurs ayant eu plus de 10 interactions
counts = interactions.groupby(by='userID').count()
counts = counts[counts['itemID']>10].index.values
interactions_ = interactions[interactions['userID'].isin(counts)]

print(f"Size of the initial interactions = {len(interactions)}")
print(f"Size of interactions (>10) = {len(interactions_)}")

# Suppression des interactions pour lesquelles le rating 
rating_to_drop = np.nan
to_drop = interactions_[interactions_['rating'].isna()].index
interactions_ = interactions_.drop(index=to_drop)
print(f"Size of interactions (>10, w/o NaN) = {len(interactions_)}")

Size of the initial interactions = 2987858
Size of interactions (>10) = 1979594
Size of interactions (>10, w/o NaN) = 1355289


## Baseline (collaborative filtering)
Hereunder, we calculate the RMSE when all ratings are estimated as the global average of known ratings.

In [9]:
# Error made if we estimate ratings by the global average
std = interactions_['rating'].std()
print(f"RMSE mean: {std}")

RMSE mean: 0.7729782188214214


## Collaborative filtering

**Training**

In [10]:
train_model = True

if train_model:
    from reco_utils.common.timer import Timer

    # Define the train-set
    reader = Reader(rating_scale=(interactions_['rating'].min(),interactions_['rating'].max()))
    train_set = surprise.Dataset.load_from_df(interactions_[['userID', 'itemID', 'rating']], reader=reader).build_full_trainset()

    rmse_l = []

    for n_factors in [10, 50, 100, 500, 1000]:

        print(f"\n n_factors = {n_factors}")

        # Train the collaborative filter
        svd = surprise.SVD(random_state=0, n_factors=n_factors, n_epochs=10, verbose=False)
        with Timer() as train_time:
            svd.fit(train_set)
        print("Took {} seconds for training.".format(train_time.interval))

        # Predict the score for all interactions where we have an implicit rating
        with Timer() as pred_time:
            preds = predict(svd, interactions_)
        print("Took {} seconds for prediction.".format(pred_time.interval))

        # Compute the error made by the above predictions
        err2 = (preds['prediction'].to_numpy() - interactions_['rating'].to_numpy())**2
        rmse = np.sqrt(err2.mean())
        print(f"RMSE svd: {rmse}")
        rmse_l.append(rmse)

    # Save the model
    print("\nDump the model parameters in a pickle file")
    with open('./data/cf.pickle', 'wb') as f:
            pickle.dump(svd, f)


 n_factors = 10
Took 13.450069900000003 seconds for training.
Took 17.6196523 seconds for prediction.
RMSE svd: 0.6731601414357692

 n_factors = 50
Took 28.7093785 seconds for training.
Took 16.89359780000001 seconds for prediction.
RMSE svd: 0.6601754215241478

 n_factors = 100
Took 45.95308489999999 seconds for training.
Took 17.324730799999998 seconds for prediction.
RMSE svd: 0.6444015331334414

 n_factors = 500
Took 255.8742438 seconds for training.
Took 16.48382730000003 seconds for prediction.
RMSE svd: 0.5458838547701672

 n_factors = 1000
Took 510.46200039999997 seconds for training.
Took 17.080846500000007 seconds for prediction.
RMSE svd: 0.4629692290912044

Dump the model parameters in a pickle file


**Compute the recommendations**

In [18]:
# Calculer la liste des recommendations pour les userID <= 100
# et faisant parti des utilisateurs ayant eu plus de 10 interactions
users_list_ = interactions_['userID'].unique()
users_list_ = users_list_[users_list_<=100]
interactions_ = interactions[interactions['userID'].isin(users_list_)]

cf_rankings = compute_ranking_predictions(svd, 
                                          interactions_, 
                                          usercol='userID',
                                          itemcol='itemID',
                                          remove_seen=True)

# Nombre minimum de recommendations retenues par user
cf_rankings.groupby(by='userID').count()['prediction'].min()

6

**Save the predictions for future upload in a CosmosDB**

In [19]:
collab = [cf_rankings.loc[cf_rankings['userID']==userID, 'itemID'].tolist() for userID in users_list_]
collab_df = pd.DataFrame(collab)
collab_df['userID'] = users_list_

In [20]:
collab_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,userID
0,348109,202534,107299,107298,355163,158535,107308.0,158541.0,106984.0,107289.0,0
1,107298,209723,61741,42937,202534,107308,107214.0,348109.0,158704.0,216183.0,1
2,348109,202534,107299,107298,355163,158535,107308.0,158541.0,106984.0,107289.0,2
3,107299,276783,348109,106984,195191,61169,145096.0,308930.0,277486.0,300048.0,3
4,348109,202534,107299,107298,355163,158535,107308.0,158541.0,106984.0,107289.0,4


In [21]:
collab_df.to_csv("./to_cosmosdb/collab.csv", index=False)

## Content based filtering

In [30]:
from time import process_time
n_users = 100
nth = 10

# Init
start_time =process_time()
user_ids = list(range(n_users))
interactions_ = interactions
user_profiles = [get_user_profile(i, interactions_, item_profiles,'userID', 'itemID','rating') \
                 for i in user_ids]

# Content based filtering
utility_matrix, top_n, top_n_df = get_user_content_vectorized(user_ids, nth, interactions_, item_profiles)
print(f"Compute the top {nth} recommendations for {n_users} users: {process_time()-start_time} secondes")



Compute the top 10 recommendations for 100 users: 7.15625 secondes
Evaluation for 100 users: 5.625 secondes


In [56]:
# Assess the utility matrix computed for the content based filtering
start_time =process_time()
dist_from_best = assess_utility_matrix(user_ids, utility_matrix, interactions_)
print(f"Evaluation for {n_users} users: {process_time()-start_time} secondes")

# Compute the average distance of items seen by the user from the best recommendation when looking 
mean = []
for i in range(20):
    mean.append(0)
    for metric in dist_from_best:
        metric = metric[0:i+1]
        mean[i] = mean[i] + metric.mean()
    mean[i] = mean[i] / len(dist_from_best)

print("\nAverage distance from the best recommendation when considering x number of items seen by a user:")
mean

Evaluation for 100 users: 5.921875 secondes

Average distance from the best recommendation when considering x number of items seen by a user:


[0.0968501319884519,
 0.694065601419597,
 1.1585537032306275,
 2.0236086732390786,
 2.8017158956581634,
 3.57706102050925,
 4.375838441582704,
 5.200661835657795,
 5.602002031963341,
 6.1238111817587075,
 6.8142416381658215,
 7.386731508146795,
 7.8395433562647705,
 8.197199279631986,
 8.522358027449352,
 8.887168054268228,
 9.2473945396493,
 9.630949670638552,
 9.939209639705924,
 10.273802613747893]

**Save the predictions for future upload in a CosmosDB**

In [54]:
top_n_df.to_csv("./to_cosmosdb/content.csv", index_label="userID")

# Connect to CosmosDB

In [None]:
from azure.cosmos import CosmosClient

In [None]:
# Initialize the Cosmos client
endpoint = "https://reco.documents.azure.com:443/"
key = 'h9opC3onnYUSdx1muRG7Tav6rmE45fTgQocnCR7oefBp4hF0bqbx94Oum5XYyetNtIXNtjTVoIRbH30vKNVwbA=='

# Connect to containers
client = CosmosClient(endpoint, key)
database = client.get_database_client('reco')
contentbase = database.get_container_client('contentbase')
collab = database.get_container_client('collab')

# Query them in SQL
userID = 0
query = 'SELECT c["0"],c["1"],c["2"],c["3"],c["4"],c["5"] FROM c WHERE c.userID=' + str(userID)    
it_collab = collab.query_items(query,enable_cross_partition_query=True)
it_contentbase = contentbase.query_items(query,enable_cross_partition_query=True)

# Extract
try:
    recos_collab = list(next(it_collab).values())
    print(f"collab: {recos_collab}")
except StopIteration:
    recos_contentbase = list(next(it_contentbase).values())
    print(f'contentbase: {recos_contentbase}')

# Back-up

In [None]:
from numba import guvectorize
@guvectorize(["float64[:], float64[:,:], float64[:]"], '(n),(m,n)->(m)', nopython=True, target='parallel')
def cosine_similarity_numba(u, v, cos_theta):
    assert u.shape[0] == v.shape[1]
    for j in range(v.shape[0]):
        uv = 0
        uu = 0
        vv = 0
        for i in range(u.shape[0]):
            uv += u[i]*v[j,i]
            uu += u[i]*u[i]
            vv += v[j,i]*v[j,i]
        cos_theta[j] = 1
        if uu != 0 and vv != 0:
            cos_theta[j] = uv / np.sqrt(vv * uu)

In [None]:
def list_enum(n):
    for i in range(n):
        yield(i)

x = list_enum(5)

next(x)

In [None]:
def get_user_content_vectorized(user_ids, nth, interactions_, item_profiles):
    """
    Compute the top nth recommendations for all users in user_ids
    The recommendations are the top n values of the utility matrix.
    The utility matrix providing for each user/item its score as the 
    cosine similarity between the user profile and the item embedding. 
    The user profile is a weighted average of the items seen by the users.
    Weights are the implicit ratings of each item seen.
    
    Inputs:
    - user_ids: list of user ids
    - nth: number of recommendation to provide for each user
    - interactions_: DataFrame of recommendations providing 'userID', 'itemID' and 'rating'
    - item_profiles: numpy array with the items embeddings in columns, line number is the item id

    Outputs:
    - utility_matrix: numpy array
    - top_n: numpy array of the n best recommendations for each users (users in line, recommendations in
    columns)
    - top_n_df: same as top_n but as a DataFrame
    """
    from sklearn.metrics.pairwise import cosine_similarity
    user_profiles = [get_user_profile(i, interactions_, item_profiles,'userID', 'itemID','rating') \
                    for i in user_ids]

    utility_matrix = cosine_similarity(user_profiles, item_profiles)

    top_n = np.argpartition(utility_matrix,kth=-nth,axis=-1)[:,-nth:]
    top_n = np.flip(top_n, axis=-1)
    top_n_df = pd.DataFrame(top_n[:,0:nth], index = user_ids)
    return(utility_matrix, top_n, top_n_df)

def assess_utility_matrix(user_ids, utility_matrix, interactions_):
    """
    Compute the quality of the utility matrix. For each item seen 
    by a user, the function computes the distance between its rating 
    and the best score for that user in the utility matrix.
    
    Inputs:
    - user_ids: list of user ids
    - interactions_: DataFrame of recommendations providing 'userID', 'itemID' and 'rating'
    - utility_matrix: numpy array

    Outputs:
    - dist_from_best
    """
    for user_id in user_ids:
            user_items = interactions_.loc[interactions_["userID"] == user_id, "itemID"].to_numpy()
            user_recos = np.argsort(utility_matrix[user_id,:])
            user_recos = np.flip(user_recos)
            dist_from_best.append(np.argwhere(np.isin(user_recos, user_items)) / len(user_recos) * 100) 
    return(dist_from_best)

In [93]:
import plotly.express as px
x = np.array([10, 50, 100, 500, 1000])
y = np.array([0.67, 0.66, 0.64, 0.54, 0.46])
df = pd.DataFrame(np.array([x,y]).T, columns=['#factors', 'RMSE'])

fig = px.scatter(df, x='#factors', y='RMSE', title="Collaborative Filtering", range_y=[0,1])
fig.add_hline(y=0.77)


In [106]:
import plotly.express as px
x = np.arange(20)
y = np.array(mean)
label_x = "On regarde x éléments vus par le user"
label_y = "Rang moyen en %"

df = pd.DataFrame(np.array([x,y]).T, columns=[label_x, label_y])

px.scatter(df, x=label_x, y=label_y, title="Content Based Filtering")