In [14]:
from tensorflow_addons.metrics import RSquare
from keras.models import load_model
import pandas as pd
import numpy as np
import joblib

### Mean Average Precision - MAP@k

In [15]:
# modified from:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """Compute the average precision at k (AP@k) for a single user.

    Parameters
    ----------
    actual : sequence
        Relevant items for the user; their order does not matter.
    predicted : sequence
        Recommended items, best first. Only the first ``k`` entries are
        scored, and a repeated item is credited only at its first position.
    k : int, optional (default=10)
        Cut-off rank. Must be positive.

    Returns
    -------
    float
        Sum of the precision values at every hit position, divided by
        ``min(len(actual), k)``. 0.0 when ``actual`` is empty.

    Raises
    ------
    ValueError
        If ``k`` is not positive. Without this check ``k == 0`` ends in a
        division by zero and a negative ``k`` produces a negative score.
    """
    if k <= 0:
        raise ValueError("ranking position k should be positive")

    # Nothing can be a hit without ground truth; ``len`` (rather than
    # truthiness) keeps the check valid for array-like inputs as well.
    if actual is None or len(actual) == 0:
        return 0.0

    top_predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(top_predicted):
        # Count an item only the first time it shows up in the ranking.
        if p in actual and p not in top_predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """Compute the mean average precision at k (MAP@k) over all users.

    Parameters
    ----------
    actual : dict
        Maps each user to the sequence of items relevant for that user.
    predicted : dict
        Maps each user to the ranked sequence of recommended items.
    k : int, optional (default=10)
        Cut-off rank forwarded to ``apk``.

    Returns
    -------
    float
        Sum of the per-user AP@k values divided by ``len(actual)``. Users
        that are in ``actual`` but missing from ``predicted`` contribute 0.
        Returns 0.0 when ``actual`` is empty instead of dividing by zero.
    """
    if not actual:
        return 0.0

    apk_sum = 0.0
    for user in actual:
        if user in predicted:
            apk_sum += apk(actual[user], predicted[user], k)

    # The denominator counts every user in ``actual``, including those
    # without predictions, so missing users lower the mean.
    return apk_sum / len(actual)

### Mean Reciprocal Rank - MRR

In [16]:
def mrr(actual, predicted):
    """Compute the mean reciprocal rank (MRR) over all users.

    Parameters
    ----------
    actual : dict
        Maps each user to the collection of items relevant for that user.
    predicted : dict
        Maps each user to the ranked sequence of recommended items.

    Returns
    -------
    float
        Sum over users of ``1 / rank`` of the first relevant recommended
        item (rank starts at 1), divided by ``len(actual)``. Users with no
        relevant recommendation, or without predictions, contribute 0.
        Returns 0.0 when ``actual`` is empty instead of dividing by zero.
    """
    if not actual:
        return 0.0

    mrr_sum = 0.0
    for user in actual:
        if user not in predicted:
            continue
        relevant = actual[user]
        for rank, movie in enumerate(predicted[user], start=1):
            if movie in relevant:
                # Only the first hit matters for the reciprocal rank.
                mrr_sum += 1.0 / rank
                break
    return mrr_sum / len(actual)

### Normalized Discounted Cumulative Gain - NDCG

In [17]:
# code from: https://gist.github.com/tgsmith61591/d8aa96ac7c74c24b33e4b0cb967ca519

# -*- coding: utf-8 -*-
#
# Author: Taylor G Smith
#
# Recommender system ranking metrics derived from Spark source for use with
# Python-based recommender libraries (i.e., implicit,
# http://github.com/benfred/implicit/). These metrics are derived from the
# original Spark Scala source code for recommender metrics.
# https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala

def _require_positive_k(k):
    """Helper function to avoid copy/pasted code for validating K"""
    if k <= 0:
        raise ValueError("ranking position k should be positive")


def _mean_ranking_metric(predictions, labels, k, metric):
    """Helper function for precision_at_k and mean_average_precision"""
    # do not zip, as this will require an extra pass of O(N). Just assert
    # equal length and index (compute in ONE pass of O(N)).
    # if len(predictions) != len(labels):
    #     raise ValueError("dim mismatch in predictions and labels!")
    # return np.mean([
    #     metric(np.asarray(predictions[i]), np.asarray(labels[i]))
    #     for i in xrange(len(predictions))
    # ])

    # Actually probably want lazy evaluation in case preds is a
    # generator, since preds can be very dense and could blow up
    # memory... but how to assert lengths equal? FIXME
    return np.mean(
        [
            metric(np.asarray(prd), np.asarray(labels[i]), k)
            for i, prd in enumerate(predictions)  # lazy eval if generator
        ]
    )


def _warn_for_empty_labels():
    """Helper for missing ground truth sets"""
    print("Empty ground truth set! Check input data")
    return 0.0

def ndcg_at(predictions, labels, k=10, assume_unique=True):
    """Compute the normalized discounted cumulative gain at K.

    Compute the average NDCG value of all the queries, truncated at ranking
    position k. The discounted cumulative gain at position k is computed as:

        DCG@k = sum_{i=1..k} (2^{rel_i} - 1) / log2(i + 1)

    where ``rel_i`` is the relevance of the i-th ranked item. The NDCG is
    obtained by dividing the DCG by the DCG of the ideal ranking built from
    the ground truth set. In the current implementation the relevance value
    is binary, so every hit contributes ``1 / log2(i + 1)``.

    The ideal DCG is accumulated over the first
    ``min(n_labels, max(n_predictions, n_labels), k)`` positions no matter
    how many predictions were supplied. A prediction list shorter than the
    cut-off is therefore penalised rather than having its ideal DCG
    truncated along with it.

    If a query has an empty ground truth set, zero will be used as
    NDCG together with a warning.

    Parameters
    ----------
    predictions : array-like, shape=(n_predictions,)
        The prediction array. The items that were predicted, in descending
        order of relevance.

    labels : array-like, shape=(n_ratings,)
        The labels (positively-rated items).

    k : int, optional (default=10)
        The rank at which to measure the NDCG.

    assume_unique : bool, optional (default=True)
        Whether to assume the items in the labels and predictions are each
        unique. That is, the same item is not predicted multiple times or
        rated multiple times.

    Raises
    ------
    ValueError
        If ``k`` is not positive.

    Examples
    --------
    >>> # predictions for 3 users
    >>> preds = [[1, 6, 2, 7, 8, 3, 9, 10, 4, 5],
    ...          [4, 1, 5, 6, 2, 7, 3, 8, 9, 10],
    ...          [1, 2, 3, 4, 5]]
    >>> # labels for the 3 users
    >>> labels = [[1, 2, 3, 4, 5], [1, 2, 3], []]
    >>> ndcg_at(preds, labels, 3)
    0.3333333432674408
    >>> ndcg_at(preds, labels, 10)
    0.48791273434956867

    References
    ----------
    .. [1] K. Jarvelin and J. Kekalainen, "IR evaluation methods for
           retrieving highly relevant documents."
    """
    # validate K
    _require_positive_k(k)

    def _inner_ndcg(pred, lab, k=10):
        if not lab.shape[0]:
            return _warn_for_empty_labels()

        # if we do NOT assume uniqueness, collapse repeated labels first
        if not assume_unique:
            lab = np.unique(lab)

        n_lab = lab.shape[0]
        n_pred = pred.shape[0]
        n = min(max(n_pred, n_lab), k)

        # Discount for 0-based position i is 1 / log2(i + 2). Keep all n
        # positions: the ideal DCG must not shrink when fewer than n
        # predictions were supplied.
        gains = 1.0 / np.log2(np.arange(n, dtype=np.float32) + 2.0)

        # DCG: gains at the positions whose predicted item is a label.
        top_pred = pred[:n]  # may be shorter than n
        dcg_mask = np.isin(top_pred, lab, assume_unique=assume_unique)
        dcg = gains[:top_pred.shape[0]][dcg_mask].sum()

        # Ideal DCG: every label ranked first, limited to the n positions.
        max_dcg = gains[:n_lab].sum()
        return dcg / max_dcg

    return _mean_ranking_metric(predictions, labels, k, _inner_ndcg)


-----------

In [18]:
################### Experiment 4 - 1m ###################
 
# ## 1m No context
# test_split_data_path = 'eval_data/test_split_1ml_no_context.csv'
# recsys_data_path = '../data/transform_data/1m_no_context/'
# nn_model_path = '../model/experiment_4/arch5_1m_no_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '1m_no_context_scaler.pkl'

# ## 1m MovieLens context
# test_split_data_path = 'eval_data/test_split_1ml_movielens_context.csv'
# recsys_data_path = '../data/transform_data/1m_movielens_context/'
# nn_model_path = '../model/experiment_4/arch5_1m_movielens_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '1m_movielens_context_scaler.pkl'

# ## 1m Added IMDb context
# test_split_data_path = 'eval_data/test_split_1ml_added_imdb_context.csv'
# recsys_data_path = '../data/transform_data/1m/'
# nn_model_path = '../model/experiment_4/arch5_1m_added_imdb_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '1m_added_imdb_context_scaler.pkl'

################### Experiment 4 - 25m ###################

# ## 25m Added IMDb context
# test_split_data_path = 'eval_data/test_split_25ml_added_imdb_context.csv'
# recsys_data_path = '../data/transform_data/25m/'
# nn_model_path = '../model/experiment_4/arch5_25m_added_imdb_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '25m_added_imdb_context_scaler.pkl'

# ## 25m MovieLens context
# test_split_data_path = 'eval_data/test_split_25ml_movielens_context.csv'
# recsys_data_path = '../data/transform_data/25m_movielens_context/'
# nn_model_path = '../model/experiment_4/arch5_25m_movielens_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '25m_movielens_context_scaler.pkl'

# ## 25m No context
# test_split_data_path = 'eval_data/test_split_25ml_no_context.csv'
# recsys_data_path = '../data/transform_data/25m_no_context/'
# nn_model_path = '../model/experiment_4/arch5_25m_no_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '25m_no_context_scaler.pkl'

################### 25m - profiles ###################

# ## 25m Added IMDb context - PROFILE 1
# test_split_data_path = 'eval_data/test_split_profile_1.csv'
# recsys_data_path = '../data/transform_data/profile_1/'
# nn_model_path = '../model/profiles/arch10_25m_profile_1_gn_trained.keras'
# scaler_file = 'scaler_profile_1.pkl'

# ## 25m Added IMDb context - PROFILE 2
# test_split_data_path = 'eval_data/test_split_profile_2.csv'
# recsys_data_path = '../data/transform_data/profile_2/'
# nn_model_path = '../model/profiles/arch10_25m_profile_2_gn_trained.keras'
# scaler_file = 'scaler_profile_2.pkl'

# ## 25m Added IMDb context - PROFILE 3
# test_split_data_path = 'eval_data/test_split_profile_3.csv'
# recsys_data_path = '../data/transform_data/profile_3/'
# nn_model_path = '../model/profiles/arch10_25m_profile_3_gn_trained.keras'
# scaler_file = 'scaler_profile_3.pkl'

################### 25m - Architecture 8 vs Architecture 10 ###################

# ## 25m Added IMDb context - Architecture 8
# test_split_data_path = 'eval_data/test_split_25ml_added_imdb_context.csv'
# recsys_data_path = '../data/transform_data/25m/'
# nn_model_path = '../model/arch8_arch10/arch8_25m_added_imdb_context_max_abs_scaler_hpt_gn_trained.keras'
# scaler_file = '25m_added_imdb_context_scaler.pkl'

# ## 25m Added IMDb context - Architecture 10
# test_split_data_path = 'eval_data/test_split_25ml_added_imdb_context.csv'
# recsys_data_path = '../data/transform_data/25m/'
# nn_model_path = '../model/arch8_arch10/arch10_25m_added_imdb_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '25m_added_imdb_context_scaler.pkl'


################### FINAL ################### (25m Added IMDb context)

# Active configuration: the four names below are read by the cells that
# follow (test split loading, scaler loading and model loading).

# Directory and file name of the fitted scaler; the two are joined by plain
# string concatenation, so the directory must keep its trailing slash.
recsys_data_path = '../data/transform_data/25m/'
scaler_file = '25m_added_imdb_context_scaler.pkl'

# Held-out ratings used for the evaluation and the trained model to evaluate.
test_split_data_path = 'eval_data/test_split_25ml_added_imdb_context.csv'
nn_model_path = '../model/global_recommending_model.keras'


In [19]:
# Read the held-out test split selected in the configuration cell and show it.
test_ratings = pd.read_csv(filepath_or_buffer=test_split_data_path)
test_ratings

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,43093,1923,3,1,2,4,3,1998,0,0,...,0,0,1,0,0,0,0,0,0,4.0
1,58800,57669,3,1,1,2,2,2008,0,0,...,0,0,0,0,0,0,1,0,0,4.5
2,134109,69075,5,1,2,4,3,1997,0,0,...,0,0,0,0,0,0,0,0,0,2.5
3,141503,1663,5,1,4,1,2,1981,0,0,...,0,0,0,0,0,0,0,1,0,4.5
4,147198,1136,4,1,4,4,2,1975,0,0,...,0,0,0,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2498342,107639,3977,6,0,3,4,2,2000,0,0,...,0,0,0,0,0,0,0,0,0,2.5
2498343,22136,2870,1,1,3,4,2,1967,0,0,...,0,0,0,0,0,0,0,0,0,4.0
2498344,162047,7883,7,0,1,3,2,1943,0,0,...,0,0,0,0,0,0,0,0,0,3.5
2498345,99479,54995,1,1,3,4,2,2007,0,0,...,0,0,0,1,0,0,0,0,0,3.0


In [20]:
# Evaluation settings
n_users = 10      # how many users to evaluate
top_k = 10        # length of each recommendation list
low_rating = 4.0  # lowest true rating that counts as a positive recommendation

# value_counts() orders userIds by descending number of test ratings, so the
# first n_users entries are the users with the most ratings in the test split.
test_ratings_all_users = test_ratings['userId'].value_counts().index[:n_users].tolist()

# Trained network under evaluation; RSquare is registered as a custom object
# so that the saved model can be deserialised.
nn_model = load_model(
    nn_model_path,
    custom_objects={'RSquare': RSquare()},
    compile=True,
)

In [21]:
# Scalers already loaded in this kernel session, keyed by file path.
# Re-running this cell resets the cache (e.g. after the file changed on disk).
_scaler_cache = {}


def scale_data(data):
    """Scale ``data`` with the scaler stored at ``recsys_data_path + scaler_file``.

    The scaler is deserialised once per path and then reused, so calling this
    function repeatedly (it runs once per user in the evaluation loop) does
    not read the file from disk every time.

    Parameters
    ----------
    data : object accepted by the scaler's ``transform`` method
        The feature rows to scale.

    Returns
    -------
    Whatever ``scaler.transform(data)`` returns.
    """
    scaler_path = recsys_data_path + scaler_file

    if scaler_path not in _scaler_cache:
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code - only point this at scaler files you trust.
        _scaler_cache[scaler_path] = joblib.load(scaler_path)

    # Return scaled data
    return _scaler_cache[scaler_path].transform(data)

In [22]:
# Show which users were selected for the evaluation.
top_users_message = f"Top {n_users} users IDs: {test_ratings_all_users}"
print(top_users_message)

Top 10 users IDs: [72315, 137293, 80974, 33844, 20055, 92046, 49403, 109731, 115102, 75309]


In [None]:
# userId -> relevant movieIds (ground truth) / userId -> top_k recommended movieIds
actual, predicted = {}, {}

for user_id in test_ratings_all_users:
    # All test-split rows that belong to this user.
    user_test_ratings = test_ratings.loc[test_ratings['userId'] == user_id]

    # Ground truth: movies the user rated with at least `low_rating`.
    actual_ratings = user_test_ratings.loc[user_test_ratings['rating'] >= low_rating]
    actual[user_id] = actual_ratings['movieId'].tolist()

    # Model input: every column except the target, passed through the scaler.
    movie_indices = user_test_ratings['movieId'].values
    scaled_user_test_ratings = scale_data(user_test_ratings.drop(columns=['rating']))

    predictions = nn_model.predict(scaled_user_test_ratings, verbose=0).flatten()

    # Rank the user's test movies by predicted rating and keep the best top_k.
    predicted_movies = (
        pd.DataFrame({'movieId': movie_indices, 'rating': predictions})
        .sort_values(by='rating', ascending=False)
    )
    predicted[user_id] = predicted_movies['movieId'].head(top_k).tolist()

----

In [26]:
# MAP@K
mapa3 = mapk(actual, predicted, 3)
mapa10 = mapk(actual, predicted, 10)

# MRR
mrr_fin = mrr(actual, predicted)

# NDCG@K works on positional lists: entry i of `p` and entry i of `a` must
# belong to the same user. Both dicts were filled in the same loop, so their
# values come out in the same user order.
p = list(predicted.values())
a = list(actual.values())

ndcga5 = ndcg_at(p, a, 5)
ndcga10 = ndcg_at(p, a, 10)


In [27]:
# Report every metric on its own line as "<label> <value>".
print(
    '\n'.join(
        f'{label} {score!s}'
        for label, score in (
            ('MRR:     ', mrr_fin),
            ('MAP@3:   ', mapa3),
            ('MAP@10:  ', mapa10),
            ('NDCG@5:  ', ndcga5),
            ('NDCG@10: ', ndcga10),
        )
    )
)

MRR:      1.0
MAP@3:    0.9
MAP@10:   0.7318015873015873
NDCG@5:   0.8468965
NDCG@10:  0.83545035
