In [1]:
import pandas as pd
import numpy as np
from typing import Callable

In [2]:
#construct interaction matrix from dataset
def inter_matr_implicit(users: pd.DataFrame,
                        items: pd.DataFrame,
                        interactions: pd.DataFrame,
                        dataset_name: str,
                        threshold=1) -> np.ndarray:
    '''
    Build a binary (implicit feedback) user-item interaction matrix.

    users - pandas Dataframe, use it as loaded from the dataset; only its number of rows is used;
    items - pandas Dataframe, use it as loaded from the dataset; only its number of rows is used;
    interactions - pandas Dataframe, use it as loaded from the dataset; it is read by position:
                   column 0 is used as the row (user) index, column 1 as the column (item) index and
                   column 2 is the value compared against the threshold;
    dataset_name - string out of ["lfm-tiny", "lfm-ismir", "ml-1m"], name of the dataset;
    threshold - int > 0, criteria of a valid interaction (column 2 must be >= threshold)

    returns - 2D np.array of floats, rows - users, columns - items; 1 marks a valid interaction, 0 otherwise;
    raises - ValueError if dataset_name is not one of the supported names (previously such a name
             silently produced an all-zero matrix).
    '''
    supported_datasets = ("lfm-tiny", "lfm-ismir", "ml-1m")
    if dataset_name not in supported_datasets:
        raise ValueError(
            f"Unknown dataset_name {dataset_name!r}; expected one of {supported_datasets}"
        )

    # initialize the binary interaction matrix with only zeros
    res = np.zeros((users.shape[0], items.shape[0]))

    # All supported datasets share the same positional layout, so a single
    # vectorized pass replaces the former per-row loops.
    valid = (interactions.iloc[:, 2] >= threshold).to_numpy()
    user_idx = interactions.iloc[:, 0].to_numpy()[valid]
    item_idx = interactions.iloc[:, 1].to_numpy()[valid]

    # mark every valid (user, item) pair; duplicate pairs simply write 1 again
    res[user_idx, item_idx] = 1

    return res

In [3]:
# Use LFM-Tiny dataset from exercise 1
def read(dataset, file):
    """Load the tab-separated table stored at `<dataset>/<dataset>.<file>` into a DataFrame."""
    path = "".join((dataset, '/', dataset, '.', file))
    return pd.read_csv(path, sep='\t')

# Load the user, item and interaction tables of the LFM-Tiny dataset.
users, items, interactions = (read("lfm-tiny", suffix) for suffix in ("user", "item", "inter"))

# Build the binary user-item matrix with a threshold of 1.
_interaction_matrix_test = inter_matr_implicit(users=users,
                                               items=items,
                                               interactions=interactions,
                                               dataset_name="lfm-tiny",
                                               threshold=1)
_interaction_matrix_test

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])


$JaccardScore(a,b) = \frac{|U(a) \cap U(b)|}{|U(a) \cup U(b)|}$, where $U(x)$ is the set of users who interacted with item $x$.


In [4]:
def jaccard_score(a: np.ndarray, b: np.ndarray) -> float:
    """
    Jaccard similarity of two interaction vectors.

    a, b: - vectors of the same length corresponding to the two items;
            any non-zero entry counts as an interaction

    returns: float - jaccard similarity score for a and b, i.e. the number of
             positions where both are non-zero divided by the number of
             positions where at least one is non-zero. If neither vector has
             any non-zero entry the score is defined as 0.0 (instead of the
             NaN produced by a 0/0 division).
    """
    intersection = np.count_nonzero(np.logical_and(a, b))
    union = np.count_nonzero(np.logical_or(a, b))

    # Two empty vectors share nothing: avoid 0/0, which would yield NaN and
    # corrupt any later sorting or averaging of scores.
    if union == 0:
        return 0.0

    return float(intersection / union)

In [5]:
def calculate_sim_scores(similarity_measure: Callable[[int, int], float],
                         inter: np.array,
                         target_vec: np.array) -> np.array:
    """
    Score every item (column) of an interaction matrix against one target vector.

    similarity_measure: Callable - called as similarity_measure(target_vec, item_column) for each column
    inter: np.array - interaction matrix whose columns are the item vectors to compare
    target_vec: np.array - target item vector

    returns: np.array - one similarity per column of <inter>, in column order
    """
    n_items = inter.shape[1]
    scores = []

    # walk the columns left to right so the output order matches the item order
    for col_idx in range(n_items):
        item_column = inter[:, col_idx]
        scores.append(similarity_measure(target_vec, item_column))

    return np.array(scores)

In [6]:

# Jaccard similarity of every item to item 0 (the first column of the matrix).
item_sims = calculate_sim_scores(
    jaccard_score,
    _interaction_matrix_test,
    _interaction_matrix_test[:, 0],
)
print(item_sims)

[1.         0.02083333 0.01030928 0.03529412 0.01449275 0.05555556
 0.04347826 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.04545455 0.
 0.         0.04545455 0.11764706 0.05882353 0.05882353 0.02150538
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.01898734 0.         0.         0.04545455 0.
 0.         0.         0.         0.01265823 0.         0.
 0.         0.         0.0625     0.         0.         0.05
 0.05555556 0.05555556 0.08695652 0.05769231 0.0625     0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.07142857 0.         0.         0.
 0.0625     0.01315789 0.0625     0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.04       0.         0.         0.         0.         0.03174603
 0.         0.         0.01234

This function takes a full interaction matrix as input, together with a user id, an item id and *n* -- the algorithm's hyperparameter, i.e. the number of neighbors to be considered while calculating the score.

The expected output is a single number between 0 and 1 - the predicted score.

In [7]:
def get_user_item_score(sim_scores_calculator: Callable[[np.ndarray, np.ndarray], np.ndarray],
                        inter: np.array,
                        target_user: int,
                        target_item: int,
                        n: int = 2) -> float:
    """
    Predict how well an item fits a user from the items the user already consumed.

    sim_scores_calculator: Callable - called as sim_scores_calculator(matrix, target_vec); must return
                                      one similarity per column of <matrix> with respect to <target_vec>
    inter: np.array - interaction matrix (rows - users, columns - items)
    target_user: int - target user id (row index into <inter>)
    target_item: int - target item id (column index into <inter>)
    n: int - n closest neighbors to consider for the score prediction

    returns: float - mean of the n highest similarities between the target item and the items
             consumed by the user = user-item 'fitness' score; 0.0 if the user has consumed no
             items (or no neighbor is selected), instead of NaN
    """
    # items consumed by the user (entries equal to 1 in the user's row)
    consumed = np.where(inter[target_user, :] == 1)[0]

    # without any consumed item there is no neighbor to compare with
    if consumed.size == 0:
        return 0.0

    # column vector of the target item
    target_item_vec = inter[:, target_item]

    # Use the injected calculator (rather than a hard-wired similarity) and hand
    # it only the consumed items' columns, so no work is spent on other items.
    similarities = np.asarray(
        sim_scores_calculator(inter[:, consumed], target_item_vec), dtype=float
    )

    # sort from higher to lower & keep the top n similarities only
    top_similarities = np.sort(similarities)[::-1][:n]
    if top_similarities.size == 0:
        return 0.0

    return float(np.mean(top_similarities))

The function takes `user_item_scorer`, a full interaction matrix, a user id, `top_k` and the hyperparameter `n` as input. It returns two arrays:

* top_k recommendations for the given user obtained considering n neighbors for score prediction
* the corresponding user-item similarity scores

In [8]:
def recTopK(user_item_scorer: Callable[[Callable, np.array, int, int], float],
            inter_matr: np.array,
            user: int,
            top_k: int,
            n: int) -> (np.array, np.array):
    '''
    Recommend the best-scoring items the user has not consumed yet.

    user_item_scorer: Callable - called as user_item_scorer(inter_matr, user, item_id, n); must return
                                 the user-item score as a number
    inter_matr: np.array - interaction matrix (rows - users, columns - items)
    user: int -  user_id (row index into <inter_matr>)
    top_k: int - expected length of the resulting list
    n: int - number of neighbors to consider

    returns - array of recommendations (sorted in the order of descending scores) & array of corresponding scores.
              Only items whose entry in the user's row equals 0 are candidates, so an already consumed
              item is never recommended; if fewer than top_k such items exist, the arrays are shorter
              than top_k.
    '''
    # Candidate items are the ones not yet consumed by the user. Scoring only
    # these (instead of giving consumed items a placeholder score of 0) keeps
    # consumed items out of the ranking entirely.
    candidates = np.flatnonzero(inter_matr[user, :] == 0)

    # calculate user-item scores for the non-seen items
    candidate_scores = np.array(
        [user_item_scorer(inter_matr, user, int(item_id), n) for item_id in candidates],
        dtype=float,
    )

    # select the top_k candidates with the highest scores, best first
    order = np.argsort(candidate_scores)[::-1][:top_k]
    top_rec = candidates[order]
    scores = candidate_scores[order]

    return top_rec, scores

Now, let's use these scoring functions and get the <b>top 10</b> recommendations for <b>user 0</b> with <b>n = 15</b>.

In [10]:
def sim_score_calc(inter, target_vec):
    """Score every item column of `inter` against `target_vec` with `jaccard_score`."""
    return calculate_sim_scores(similarity_measure=jaccard_score,
                                inter=inter,
                                target_vec=target_vec)
def user_item_scorer(inter, target_user, target_item, n):
    """Wrap `get_user_item_score`, passing `sim_score_calc` as its similarity calculator."""
    return get_user_item_score(sim_scores_calculator=sim_score_calc,
                               inter=inter,
                               target_user=target_user,
                               target_item=target_item,
                               n=n)
# Example: top 10 recommendations for user 0, using n = 15 neighbors.
rec_item_cf, scores_item_cf = recTopK(user_item_scorer=user_item_scorer,
                                      inter_matr=_interaction_matrix_test,
                                      user=0,
                                      top_k=10,
                                      n=15)

In [11]:
# Report the recommended item ids and their scores, followed by a separator line.
print("Recommendations with Item CF: ", rec_item_cf)
print("With Scores: ", scores_item_cf)
print("-" * 75)

Recommendations with Item CF:  [117  51  12  43 129  56  30 167  98   8]
With Scores:  [0.05700778 0.05121532 0.05023276 0.04848317 0.04670654 0.04642857
 0.04491342 0.04259023 0.03993695 0.03945571]
---------------------------------------------------------------------------
