In [19]:
from keras.models import load_model
import pandas as pd
import numpy as np
import joblib

### Mean Average Precision - MAP@k

In [2]:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when `actual` is empty or when `k` is not positive.

    """
    # Degenerate inputs: nothing to find, or no prediction slots to score.
    # A non-positive k would otherwise divide by zero (k == 0) or give a
    # negative score (k < 0) in the normalisation at the end.
    if not actual or k <= 0:
        return 0.0

    # Only the first k predictions take part in the score.
    predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a hit only the first time an item shows up, so repeated
        # predictions of the same item cannot inflate the score.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    # Normalise by the best achievable number of hits within k slots.
    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    The two arguments are walked in parallel: the i-th entry of `actual` is
    scored against the i-th entry of `predicted` with `apk`, and the
    per-pair scores are averaged.

    Note that every entry must itself be a list of items. If flat lists of
    strings are passed, each pair of strings is handed to `apk`, which then
    compares them character by character.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)

### Mean Reciprocal Rank - MRR

In [3]:
# author's github: unsuthee
# link to github: https://github.com/unsuthee/VariationalDeepSemanticHashing/blob/master/rank_metrics.py

def mean_reciprocal_rank(rs):
    """Average, over all rankings, of 1 / (rank of the first relevant item).

    Ranks are 1-based (the first element is 'rank 1'). Relevance is binary:
    any nonzero entry counts as relevant. A ranking without any relevant
    entry contributes 0 to the mean.

    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75

    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)

    Returns:
        Mean reciprocal rank
    """
    reciprocal_ranks = []
    for relevance in rs:
        # Positions (0-based) of every nonzero, i.e. relevant, entry.
        hit_positions = np.asarray(relevance).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)


def calculate_ranks(y_test, predicted):
    """Build one rank vector per test item for `mean_reciprocal_rank`.

    For every item in `y_test` a list of zeros with the same length as
    `predicted` is created. If the item occurs in `predicted`, the slot at
    its first position is set to its 1-based rank; otherwise the vector
    stays all zeros.

    Args:
        y_test: items to look up.
        predicted: ranked sequence of predicted items (first is best).

    Returns:
        List of rank vectors, one per entry of `y_test`, in the same order.
    """
    rank_vectors = []
    for target in y_test:
        vector = [0] * len(predicted)
        if target in predicted:
            position = predicted.index(target)
            vector[position] = position + 1
        rank_vectors.append(vector)
    return rank_vectors

-----------

In [7]:
# Load the test split; reset_index() turns the row index into an explicit
# 'index' column placed first.
test_data = pd.read_csv('test_split_25ml.csv').reset_index()
test_data

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,0,43093,1923,3,1,2,4,summer_holiday,1998,movie,...,0,0,1,0,0,0,0,0,0,4.0
1,1,58800,57669,3,1,1,2,no_holiday,2008,movie,...,0,0,0,0,0,0,1,0,0,4.5
2,2,134109,69075,5,1,2,4,summer_holiday,1997,movie,...,0,0,0,0,0,0,0,0,0,2.5
3,3,141503,1663,5,1,4,1,no_holiday,1981,movie,...,0,0,0,0,0,0,0,1,0,4.5
4,4,147198,1136,4,1,4,4,no_holiday,1975,movie,...,0,0,0,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2498342,2498342,107639,3977,6,0,3,4,no_holiday,2000,movie,...,0,0,0,0,0,0,0,0,0,2.5
2498343,2498343,22136,2870,1,1,3,4,no_holiday,1967,movie,...,0,0,0,0,0,0,0,0,0,4.0
2498344,2498344,162047,7883,7,0,1,3,no_holiday,1943,movie,...,0,0,0,0,0,0,0,0,0,3.5
2498345,2498345,99479,54995,1,1,3,4,no_holiday,2007,movie,...,0,0,0,1,0,0,0,0,0,3.0


In [8]:
# Candidate users for evaluation, ordered so the users with the most test
# ratings come first.
users_to_eval = (
    pd.read_csv('user_ids_in_train_test_split_25ml.csv')
    .sort_values(by='testCount', ascending=False)
)
users_to_eval

Unnamed: 0,userId,trainCount,testCount
57,72315,25518,3312
34,137293,7110,929
35,80974,7392,898
23,33844,6353,847
41,20055,5955,739
7,92046,5190,694
50,49403,5169,690
14,109731,5322,667
11,115102,4459,590
18,75309,4413,563


In [9]:
user_id = 72315
top_k = 10

# All test-split ratings of the user under evaluation.
user_test_ratings = test_data[test_data['userId'] == user_id]

rating_grades = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

# Keep the first test row found for each rating grade. The per-grade slices
# are collected first and concatenated once, rather than repeatedly
# concatenating onto an initially empty DataFrame inside the loop.
test_y_all = pd.concat(
    [
        user_test_ratings[user_test_ratings['rating'] == rating_grade].head(1)
        for rating_grade in rating_grades
    ]
)

In [33]:
# Order the sampled rows from the highest to the lowest rating.
test_y_all = test_y_all.sort_values('rating', ascending=False)
test_y_all

Unnamed: 0,index,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
19541,19541,72315,201416,6,0,2,3,summer_holiday,1985,tvMovie,...,0,0,0,0,0,0,0,0,0,5.0
29809,29809,72315,4710,2,1,4,4,no_holiday,1976,movie,...,0,0,0,0,0,0,0,0,1,4.5
235,235,72315,77455,2,1,4,1,no_holiday,2010,movie,...,0,0,0,0,0,0,0,0,0,4.0
4673,4673,72315,192047,6,0,2,3,summer_holiday,2005,movie,...,0,0,0,0,0,0,0,0,0,3.5
2793,2793,72315,141008,2,1,4,1,no_holiday,2012,movie,...,0,0,0,0,0,0,0,0,0,3.0
3294,3294,72315,182073,6,0,2,3,summer_holiday,1939,short,...,0,0,0,0,0,0,0,0,0,2.5
8697,8697,72315,90549,4,1,2,1,summer_holiday,2008,movie,...,0,0,1,0,0,0,0,1,0,2.0
40401,40401,72315,58411,5,1,4,1,no_holiday,2006,movie,...,0,0,0,0,0,0,0,0,0,1.5
23472,23472,72315,162926,5,1,2,1,summer_holiday,2005,movie,...,0,0,0,0,0,0,0,0,0,1.0
63841,63841,72315,78941,2,1,4,4,no_holiday,2006,movie,...,0,0,0,0,0,0,0,0,0,0.5


In [34]:
# Ground truth: only the movie id and the rating the user gave it.
test_y = test_y_all.loc[:, ['movieId', 'rating']]
test_y

Unnamed: 0,movieId,rating
19541,201416,5.0
29809,4710,4.5
235,77455,4.0
4673,192047,3.5
2793,141008,3.0
3294,182073,2.5
8697,90549,2.0
40401,58411,1.5
23472,162926,1.0
63841,78941,0.5


In [58]:
# create test list of movieIds
test_y_list = test_y['movieId'].tolist()
test_y_list

[201416, 4710, 77455, 192047, 141008, 182073, 90549, 58411, 162926, 78941]

Create predictions for the test set

In [36]:
# Model input rows: the sampled test rows without the target ('rating') and
# without the helper 'index' column.
pred_y_all = test_y_all.drop(columns=['rating', 'index'])
pred_y_all

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreMystery,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern
19541,72315,201416,6,0,2,3,summer_holiday,1985,tvMovie,0,...,0,0,0,0,0,0,0,0,0,0
29809,72315,4710,2,1,4,4,no_holiday,1976,movie,0,...,0,0,0,0,0,0,0,0,0,1
235,72315,77455,2,1,4,1,no_holiday,2010,movie,0,...,0,0,0,0,0,0,0,0,0,0
4673,72315,192047,6,0,2,3,summer_holiday,2005,movie,0,...,0,0,0,0,0,0,0,0,0,0
2793,72315,141008,2,1,4,1,no_holiday,2012,movie,0,...,0,0,0,0,0,0,0,0,0,0
3294,72315,182073,6,0,2,3,summer_holiday,1939,short,0,...,0,0,0,0,0,0,0,0,0,0
8697,72315,90549,4,1,2,1,summer_holiday,2008,movie,0,...,0,0,0,1,0,0,0,0,1,0
40401,72315,58411,5,1,4,1,no_holiday,2006,movie,0,...,0,0,0,0,0,0,0,0,0,0
23472,72315,162926,5,1,2,1,summer_holiday,2005,movie,0,...,0,0,0,0,0,0,0,0,0,0
63841,72315,78941,2,1,4,4,no_holiday,2006,movie,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Directory prefix for the joblib-loaded label encoders and scaler
# (trailing slash required: file names are appended by string concatenation).
recsys_data_path = '../data/transform_data/'
# Saved Keras model passed to load_model().
nn_model_path = '../model/arch8_25m_added_imdb_context_max_abs_scaler_pc_trained.keras'

In [37]:
# Load the persisted label encoders and the scaler.
# NOTE: joblib.load unpickles the files - only load artifacts you trust.
actor_label_encoder = joblib.load(recsys_data_path + 'actor_label_encoder.pkl')
directors_label_encoder = joblib.load(recsys_data_path + 'directors_label_encoder.pkl')
holiday_label_encoder = joblib.load(recsys_data_path + 'holiday_label_encoder.pkl')
titleType_label_encoder = joblib.load(recsys_data_path + 'titleType_label_encoder.pkl')
scaler = joblib.load(recsys_data_path + '25m_added_imdb_context_scaler.pkl')

# Replace each categorical column with its label-encoded values.
# NOTE: this overwrites the columns of pred_y_all, so re-running the cell
# feeds already-encoded values back into the encoders.
for column_name, fitted_encoder in [
    ('actor', actor_label_encoder),
    ('directors', directors_label_encoder),
    ('holiday', holiday_label_encoder),
    ('titleType', titleType_label_encoder),
]:
    pred_y_all[column_name] = fitted_encoder.transform(pred_y_all[column_name])

# Scale the encoded rows; the result is what the model is fed.
new_data = scaler.transform(pred_y_all)

In [39]:
# Restore the saved model and predict one value per scaled input row.
nn_model = load_model(nn_model_path, compile=True)
predictions = nn_model.predict(new_data, verbose=0)
predictions

array([[2.9605534],
       [3.8560395],
       [3.8089442],
       [3.462491 ],
       [3.4672904],
       [3.682148 ],
       [3.2377586],
       [3.398469 ],
       [3.5361023],
       [3.4401622]], dtype=float32)

In [43]:
# Attach the model output to the input rows, then rank the movies from the
# highest to the lowest predicted value.
pred_y_all['pred_y'] = predictions
pred_y = pred_y_all[['movieId', 'pred_y']].sort_values(by='pred_y', ascending=False)
pred_y

Unnamed: 0,movieId,pred_y
29809,4710,3.85604
235,77455,3.808944
3294,182073,3.682148
23472,162926,3.536102
2793,141008,3.46729
4673,192047,3.462491
63841,78941,3.440162
40401,58411,3.398469
8697,90549,3.237759
19541,201416,2.960553


In [55]:
# Movie ids in predicted order (best first) as a plain Python list.
pred_y_list = pred_y['movieId'].to_list()
pred_y_list

[4710, 77455, 182073, 162926, 141008, 192047, 78941, 58411, 90549, 201416]

----

In [68]:
# Show the ground-truth order next to the predicted order.
for label, movie_ids in (
    ('Test movieIds:', test_y_list),
    ('Predicted movieIds:', pred_y_list),
):
    print(label, movie_ids)

Test movieIds: [201416, 4710, 77455, 192047, 141008, 182073, 90549, 58411, 162926, 78941]
Predicted movieIds: [4710, 77455, 182073, 162926, 141008, 192047, 78941, 58411, 90549, 201416]


In [69]:
# String versions of both id lists, same order as the originals.
test_y_list_str = list(map(str, test_y_list))
pred_y_list_str = list(map(str, pred_y_list))

In [73]:
# MRR
mrr = mean_reciprocal_rank(calculate_ranks(test_y_list_str[:5], pred_y_list_str[:5]))
mrr

0.33999999999999997

In [72]:
# MAP@K
mapak = mapk(test_y_list_str, pred_y_list_str, 5)
mapak

0.379