In [23]:
from keras.models import load_model
import pandas as pd
import numpy as np
import joblib

### Mean Average Precision - MAP@k

In [24]:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

### Mean Reciprocal Rank - MRR

In [25]:
# author's github: unsuthee
# link to github: https://github.com/unsuthee/VariationalDeepSemanticHashing/blob/master/rank_metrics.py

def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item

    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).

    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def calculate_ranks(y_test, predicted):
    ranks = []
    for y_true in y_test:
        rank = [0] * len(predicted)
        if y_true in predicted:
            rank[predicted.index(y_true)] = predicted.index(y_true) + 1
        ranks.append(rank)
    return ranks

-----------

In [26]:
# test_split_path = 'eval_data/test_split_25ml.csv'
test_split_path = 'eval_data/test_split_profile_1.csv'

# test_split_path = 'eval_data/test_split_1m_added_imdb_context.csv'

In [27]:
test_data = pd.read_csv(test_split_path)
# put index as the first column
test_data = test_data.reset_index()
test_data

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,0,72315,183355,6,0,2,3,3,1971,0,...,0,0,0,0,0,0,0,0,0,3.5
1,1,72315,103790,2,1,4,1,2,2001,4,...,0,0,0,0,0,0,0,0,0,3.5
2,2,72315,3129,2,1,4,1,2,1999,0,...,0,0,0,0,0,0,0,0,0,3.5
3,3,72315,43921,5,1,3,1,2,2006,0,...,0,0,0,0,0,0,1,0,0,3.5
4,4,72315,89864,2,1,4,1,2,2011,0,...,0,0,0,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3213,3213,72315,155617,6,0,1,2,2,1980,0,...,0,0,0,0,0,0,0,0,0,3.5
3214,3214,72315,49007,4,1,2,1,3,1966,0,...,0,0,1,0,0,0,1,0,0,3.0
3215,3215,72315,80076,6,0,4,4,4,2003,0,...,0,0,1,0,0,0,0,0,0,2.5
3216,3216,72315,143043,6,0,4,1,1,2013,0,...,0,0,0,0,0,0,0,0,0,2.5


In [28]:
# users_to_eval = pd.read_csv('user_ids_in_train_test_split_25ml.csv')
# users_to_eval = users_to_eval.sort_values(by='testCount', ascending=False)
# users_to_eval

In [29]:
# profile 1 - 72315 | profile 2 - 80974 | profile 3 - 107650

user_id = 72315
top_k = 10
number_of_each_rating = 2

user_test_ratings = test_data[test_data['userId'] == user_id]

test_y_all = pd.DataFrame()
rating_grades = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

for rating_grade in rating_grades:
    test_y_all = pd.concat([test_y_all, user_test_ratings[user_test_ratings['rating'] == rating_grade].head(number_of_each_rating)])

In [30]:
test_y_all = test_y_all.sort_values(by='rating', ascending=False)
test_y_all

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
148,148,72315,69498,5,1,4,2,2,1950,0,...,0,0,1,0,0,0,0,0,0,5.0
129,129,72315,104423,6,0,4,4,4,2011,0,...,0,0,0,0,0,0,0,0,0,5.0
83,83,72315,111732,5,1,4,1,2,2013,0,...,0,0,0,0,0,0,0,0,0,4.5
31,31,72315,1225,6,0,4,4,4,1984,0,...,0,0,0,0,0,0,0,0,0,4.5
52,52,72315,4381,5,1,4,2,2,2001,0,...,0,0,0,0,0,0,0,0,0,4.0
32,32,72315,137494,2,1,4,4,2,1995,0,...,0,0,0,0,0,0,0,0,0,4.0
1,1,72315,103790,2,1,4,1,2,2001,4,...,0,0,0,0,0,0,0,0,0,3.5
0,0,72315,183355,6,0,2,3,3,1971,0,...,0,0,0,0,0,0,0,0,0,3.5
6,6,72315,91110,2,1,4,1,2,1976,0,...,0,0,1,0,0,0,1,0,0,3.0
5,5,72315,88175,3,1,3,1,2,1941,0,...,0,0,0,0,0,0,0,0,0,3.0


In [31]:
test_y = test_y_all[['movieId', 'rating']]
test_y

Unnamed: 0,movieId,rating
148,69498,5.0
129,104423,5.0
83,111732,4.5
31,1225,4.5
52,4381,4.0
32,137494,4.0
1,103790,3.5
0,183355,3.5
6,91110,3.0
5,88175,3.0


In [32]:
# create test list of movieIds
test_y_list = test_y['movieId'].tolist()
test_y_list

[69498,
 104423,
 111732,
 1225,
 4381,
 137494,
 103790,
 183355,
 91110,
 88175,
 194686,
 205351,
 25910,
 118326,
 118874,
 47503,
 134045,
 42736,
 127132,
 96030]

Create predictions for the test set

In [33]:
pred_y_all = test_y_all.drop(['rating', 'index'], axis=1)
pred_y_all

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreMystery,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern
148,72315,69498,5,1,4,2,2,1950,0,0,...,0,0,0,1,0,0,0,0,0,0
129,72315,104423,6,0,4,4,4,2011,0,0,...,0,0,0,0,0,0,0,0,0,0
83,72315,111732,5,1,4,1,2,2013,0,0,...,0,0,0,0,0,0,0,0,0,0
31,72315,1225,6,0,4,4,4,1984,0,0,...,0,0,0,0,0,0,0,0,0,0
52,72315,4381,5,1,4,2,2,2001,0,0,...,0,0,0,0,0,0,0,0,0,0
32,72315,137494,2,1,4,4,2,1995,0,0,...,0,0,0,0,0,0,0,0,0,0
1,72315,103790,2,1,4,1,2,2001,4,0,...,0,0,0,0,0,0,0,0,0,0
0,72315,183355,6,0,2,3,3,1971,0,0,...,0,0,0,0,0,0,0,0,0,0
6,72315,91110,2,1,4,1,2,1976,0,0,...,0,0,0,1,0,0,0,1,0,0
5,72315,88175,3,1,3,1,2,1941,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
recsys_data_path = '../data/transform_data/profile_1/'
nn_model_path = '../model/arch8_25m_profile_1_trained.keras'
scaler_file = 'scaler_profile_1.pkl'

# recsys_data_path = '../data/transform_data/25m/'
# # nn_model_path = '../model/arch10_25m_added_imdb_context_max_abs_scaler_gn_trained.keras'
# scaler_file = '25m_added_imdb_context_scaler.pkl'
# target_scaler_file = '25m_added_imdb_context_target_scaler.pkl'

In [35]:
actor_label_encoder = joblib.load(recsys_data_path + 'actor_label_encoder.pkl')
directors_label_encoder = joblib.load(recsys_data_path + 'directors_label_encoder.pkl')
holiday_label_encoder = joblib.load(recsys_data_path + 'holiday_label_encoder.pkl')
titleType_label_encoder = joblib.load(recsys_data_path + 'titleType_label_encoder.pkl')

# Load scaler
scaler = joblib.load(recsys_data_path + scaler_file)

# Label encode data
pred_y_all['actor'] = actor_label_encoder.transform(pred_y_all['actor'])
pred_y_all['directors'] = directors_label_encoder.transform(pred_y_all['directors'])
pred_y_all['holiday'] = holiday_label_encoder.transform(pred_y_all['holiday'])
pred_y_all['titleType'] = titleType_label_encoder.transform(pred_y_all['titleType'])

# Scale data
new_data = scaler.transform(pred_y_all)

In [36]:
nn_model = load_model(nn_model_path, compile=True)
# nn_model = load_model(model_path, compile=True)

predictions = nn_model.predict(new_data, verbose=0)
predictions

array([[3.3549812],
       [3.2663677],
       [3.2581594],
       [3.7270658],
       [3.36773  ],
       [3.1885107],
       [3.4933398],
       [2.6383853],
       [3.240156 ],
       [2.9584968],
       [2.853392 ],
       [3.054798 ],
       [3.0053046],
       [2.8471167],
       [2.8725727],
       [3.3886406],
       [3.1240394],
       [2.9093726],
       [3.01319  ],
       [2.6294105]], dtype=float32)

In [37]:
pred_y_all['pred_y'] = predictions
pred_y = pred_y_all[['movieId', 'pred_y']]
pred_y = pred_y.sort_values(by='pred_y', ascending=False)
pred_y

Unnamed: 0,movieId,pred_y
31,1225,3.727066
1,103790,3.49334
22,47503,3.388641
52,4381,3.36773
148,69498,3.354981
129,104423,3.266368
83,111732,3.258159
6,91110,3.240156
32,137494,3.188511
112,134045,3.124039


In [38]:
pred_y_list = pred_y['movieId'].tolist()
pred_y_list

[1225,
 103790,
 47503,
 4381,
 69498,
 104423,
 111732,
 91110,
 137494,
 134045,
 205351,
 127132,
 25910,
 88175,
 42736,
 118874,
 194686,
 118326,
 183355,
 96030]

----

In [39]:
print('Test movieIds:', test_y_list)
print('Predicted movieIds:', pred_y_list)

Test movieIds: [69498, 104423, 111732, 1225, 4381, 137494, 103790, 183355, 91110, 88175, 194686, 205351, 25910, 118326, 118874, 47503, 134045, 42736, 127132, 96030]
Predicted movieIds: [1225, 103790, 47503, 4381, 69498, 104423, 111732, 91110, 137494, 134045, 205351, 127132, 25910, 88175, 42736, 118874, 194686, 118326, 183355, 96030]


In [40]:
test_y_list_str = [str(item) for item in test_y_list]
pred_y_list_str = [str(item) for item in pred_y_list]

In [41]:
# MRR
mrr = mean_reciprocal_rank(calculate_ranks(test_y_list_str[:7], pred_y_list_str[:7]))
# mrr = mean_reciprocal_rank(calculate_ranks(test_y_list_str, pred_y_list_str))
mrr

0.3227891156462585

In [42]:
# MAP@K
mapak = mapk(test_y_list_str, pred_y_list_str, 3)
mapak

0.39444444444444443