In [316]:
from keras.models import load_model
import pandas as pd
import numpy as np
import joblib

### Mean Average Precision - MAP@k

In [317]:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

### Mean Reciprocal Rank - MRR

In [318]:
# author's github: unsuthee
# link to github: https://github.com/unsuthee/VariationalDeepSemanticHashing/blob/master/rank_metrics.py

def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item

    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).

    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


def calculate_ranks(y_test, predicted):
    ranks = []
    for y_true in y_test:
        rank = [0] * len(predicted)
        if y_true in predicted:
            rank[predicted.index(y_true)] = predicted.index(y_true) + 1
        ranks.append(rank)
    return ranks

-----------

In [319]:
# test_split_path = 'eval_data/test_split_25ml.csv'
# test_split_path = 'eval_data/test_split_profile_1.csv'

test_split_path = 'eval_data/test_split_1m_added_imdb_context.csv'

In [320]:
test_data = pd.read_csv(test_split_path)
# put index as the first column
test_data = test_data.reset_index()
test_data

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,0,43093,1923,3,1,2,4,summer_holiday,1998,movie,...,0,0,1,0,0,0,0,0,0,4.0
1,1,58800,57669,3,1,1,2,no_holiday,2008,movie,...,0,0,0,0,0,0,1,0,0,4.5
2,2,134109,69075,5,1,2,4,summer_holiday,1997,movie,...,0,0,0,0,0,0,0,0,0,2.5
3,3,141503,1663,5,1,4,1,no_holiday,1981,movie,...,0,0,0,0,0,0,0,1,0,4.5
4,4,147198,1136,4,1,4,4,no_holiday,1975,movie,...,0,0,0,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2498342,2498342,107639,3977,6,0,3,4,no_holiday,2000,movie,...,0,0,0,0,0,0,0,0,0,2.5
2498343,2498343,22136,2870,1,1,3,4,no_holiday,1967,movie,...,0,0,0,0,0,0,0,0,0,4.0
2498344,2498344,162047,7883,7,0,1,3,no_holiday,1943,movie,...,0,0,0,0,0,0,0,0,0,3.5
2498345,2498345,99479,54995,1,1,3,4,no_holiday,2007,movie,...,0,0,0,1,0,0,0,0,0,3.0


In [321]:
# users_to_eval = pd.read_csv('user_ids_in_train_test_split_25ml.csv')
# users_to_eval = users_to_eval.sort_values(by='testCount', ascending=False)
# users_to_eval

In [353]:
# profile 1 - 72315 | profile 2 - 80974 | profile 3 - 107650

user_id = 4169
top_k = 10
number_of_each_rating = 2

user_test_ratings = test_data[test_data['userId'] == user_id]

test_y_all = pd.DataFrame()
rating_grades = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

for rating_grade in rating_grades:
    test_y_all = pd.concat([test_y_all, user_test_ratings[user_test_ratings['rating'] == rating_grade].head(number_of_each_rating)])

In [354]:
test_y_all = test_y_all.sort_values(by='rating', ascending=False)
test_y_all

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
23378,23378,72315,8188,5,1,4,2,no_holiday,1954,movie,...,0,0,0,0,0,0,0,0,0,5.0
19541,19541,72315,201416,6,0,2,3,summer_holiday,1985,tvMovie,...,0,0,0,0,0,0,0,0,0,5.0
42464,42464,72315,428,2,1,4,4,no_holiday,1993,movie,...,0,0,0,0,0,0,0,0,0,4.5
29809,29809,72315,4710,2,1,4,4,no_holiday,1976,movie,...,0,0,0,0,0,0,0,0,1,4.5
9805,9805,72315,170361,4,1,2,2,summer_holiday,2016,movie,...,0,0,0,0,0,0,0,0,0,4.0
235,235,72315,77455,2,1,4,1,no_holiday,2010,movie,...,0,0,0,0,0,0,0,0,0,4.0
13708,13708,72315,161586,6,0,4,1,christmas,2016,movie,...,0,0,0,0,0,0,0,0,0,3.5
4673,4673,72315,192047,6,0,2,3,summer_holiday,2005,movie,...,0,0,0,0,0,0,0,0,0,3.5
3655,3655,72315,6728,2,1,4,1,no_holiday,1963,movie,...,0,0,0,0,0,0,0,0,0,3.0
2793,2793,72315,141008,2,1,4,1,no_holiday,2012,movie,...,0,0,0,0,0,0,0,0,0,3.0


In [355]:
test_y = test_y_all[['movieId', 'rating']]
test_y

Unnamed: 0,movieId,rating
23378,8188,5.0
19541,201416,5.0
42464,428,4.5
29809,4710,4.5
9805,170361,4.0
235,77455,4.0
13708,161586,3.5
4673,192047,3.5
3655,6728,3.0
2793,141008,3.0


In [356]:
# create test list of movieIds
test_y_list = test_y['movieId'].tolist()
test_y_list

[8188,
 201416,
 428,
 4710,
 170361,
 77455,
 161586,
 192047,
 6728,
 141008,
 155709,
 182073,
 25884,
 90549,
 196,
 58411,
 173081,
 162926,
 6514,
 78941]

Create predictions for the test set

In [357]:
pred_y_all = test_y_all.drop(['rating', 'index'], axis=1)
pred_y_all

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreMystery,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern
23378,72315,8188,5,1,4,2,no_holiday,1954,movie,0,...,0,0,0,0,0,0,0,0,0,0
19541,72315,201416,6,0,2,3,summer_holiday,1985,tvMovie,0,...,0,0,0,0,0,0,0,0,0,0
42464,72315,428,2,1,4,4,no_holiday,1993,movie,0,...,0,0,0,0,0,0,0,0,0,0
29809,72315,4710,2,1,4,4,no_holiday,1976,movie,0,...,0,0,0,0,0,0,0,0,0,1
9805,72315,170361,4,1,2,2,summer_holiday,2016,movie,0,...,0,0,0,0,0,0,0,0,0,0
235,72315,77455,2,1,4,1,no_holiday,2010,movie,0,...,0,0,0,0,0,0,0,0,0,0
13708,72315,161586,6,0,4,1,christmas,2016,movie,0,...,0,0,0,0,0,0,0,0,0,0
4673,72315,192047,6,0,2,3,summer_holiday,2005,movie,0,...,0,0,0,0,0,0,0,0,0,0
3655,72315,6728,2,1,4,1,no_holiday,1963,movie,0,...,0,0,0,0,0,0,0,0,0,0
2793,72315,141008,2,1,4,1,no_holiday,2012,movie,0,...,0,0,0,0,0,0,0,0,0,0


In [358]:
# recsys_data_path = '../data/transform_data/profile_1/'
# nn_model_path = '../model/arch8_25m_profile_1_trained.keras'
# scaler_file = 'scaler_profile_1.pkl'

recsys_data_path = '../data/transform_data/25m/'
nn_model_path = '../model/arch8_25m_added_imdb_context_max_abs_scaler_pc_trained.keras'
scaler_file = '25m_added_imdb_context_scaler.pkl'

In [359]:
actor_label_encoder = joblib.load(recsys_data_path + 'actor_label_encoder.pkl')
directors_label_encoder = joblib.load(recsys_data_path + 'directors_label_encoder.pkl')
holiday_label_encoder = joblib.load(recsys_data_path + 'holiday_label_encoder.pkl')
titleType_label_encoder = joblib.load(recsys_data_path + 'titleType_label_encoder.pkl')

# Load scaler
scaler = joblib.load(recsys_data_path + scaler_file)

# Label encode data
pred_y_all['actor'] = actor_label_encoder.transform(pred_y_all['actor'])
pred_y_all['directors'] = directors_label_encoder.transform(pred_y_all['directors'])
pred_y_all['holiday'] = holiday_label_encoder.transform(pred_y_all['holiday'])
pred_y_all['titleType'] = titleType_label_encoder.transform(pred_y_all['titleType'])

# Scale data
new_data = scaler.transform(pred_y_all)

In [360]:
nn_model = load_model(nn_model_path, compile=True)
# nn_model = load_model(model_path, compile=True)

predictions = nn_model.predict(new_data, verbose=0)
predictions

array([[3.5318031],
       [2.9605534],
       [3.6920319],
       [3.8560395],
       [3.489362 ],
       [3.8089442],
       [3.297967 ],
       [3.462491 ],
       [3.6121032],
       [3.4672904],
       [3.196197 ],
       [3.682148 ],
       [3.0646727],
       [3.2377586],
       [2.8118806],
       [3.398469 ],
       [2.487362 ],
       [3.5361023],
       [2.342158 ],
       [3.4401622]], dtype=float32)

In [361]:
pred_y_all['pred_y'] = predictions
pred_y = pred_y_all[['movieId', 'pred_y']]
pred_y = pred_y.sort_values(by='pred_y', ascending=False)
pred_y

Unnamed: 0,movieId,pred_y
29809,4710,3.85604
235,77455,3.808944
42464,428,3.692032
3294,182073,3.682148
3655,6728,3.612103
23472,162926,3.536102
23378,8188,3.531803
9805,170361,3.489362
2793,141008,3.46729
4673,192047,3.462491


In [362]:
pred_y_list = pred_y['movieId'].tolist()
pred_y_list

[4710,
 77455,
 428,
 182073,
 6728,
 162926,
 8188,
 170361,
 141008,
 192047,
 78941,
 58411,
 161586,
 90549,
 155709,
 25884,
 201416,
 196,
 173081,
 6514]

----

In [363]:
print('Test movieIds:', test_y_list)
print('Predicted movieIds:', pred_y_list)

Test movieIds: [8188, 201416, 428, 4710, 170361, 77455, 161586, 192047, 6728, 141008, 155709, 182073, 25884, 90549, 196, 58411, 173081, 162926, 6514, 78941]
Predicted movieIds: [4710, 77455, 428, 182073, 6728, 162926, 8188, 170361, 141008, 192047, 78941, 58411, 161586, 90549, 155709, 25884, 201416, 196, 173081, 6514]


In [364]:
test_y_list_str = [str(item) for item in test_y_list]
pred_y_list_str = [str(item) for item in pred_y_list]

In [367]:
# MRR
mrr = mean_reciprocal_rank(calculate_ranks(test_y_list_str[:7], pred_y_list_str[:7]))
# mrr = mean_reciprocal_rank(calculate_ranks(test_y_list_str, pred_y_list_str))
mrr

0.28231292517006806

In [369]:
# MAP@K
mapak = mapk(test_y_list_str, pred_y_list_str, 3)
mapak

0.425