In [13]:
from keras.models import load_model
import pandas as pd
import numpy as np
import joblib

### Mean Average Precision - MAP@k

In [14]:
# modified from:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """Average precision at k for one ranked recommendation list.

    Parameters
    ----------
    actual : list
        Items that are relevant for the user (order does not matter).
    predicted : list
        Recommended items, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off for the ranked list (default 10).

    Returns
    -------
    float
        Sum of precision values at each rank where a new relevant item
        appears, divided by ``min(len(actual), k)``; ``0.0`` when ``actual``
        is empty.
    """
    # Anything past rank k is ignored.
    ranked = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_total = 0.0
    for rank, item in enumerate(ranked, start=1):
        # A repeated recommendation only counts the first time it appears.
        if item in actual and item not in ranked[:rank - 1]:
            hits += 1.0
            precision_total += hits / rank

    if not actual:
        return 0.0

    return precision_total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """Mean average precision at k over all users in ``actual``.

    Parameters
    ----------
    actual : dict
        Maps each user to the list of items relevant for that user.
    predicted : dict
        Maps each user to that user's ranked list of recommended items.
    k : int, optional
        Cut-off passed through to ``apk`` (default 10).

    Returns
    -------
    float
        Sum of per-user ``apk`` scores divided by ``len(actual)``. Users
        that have no entry in ``predicted`` contribute 0 but still count in
        the denominator. Returns ``0.0`` when ``actual`` is empty.
    """
    # No users to average over: avoid dividing by zero.
    if not actual:
        return 0.0

    apk_sum = sum(
        (apk(actual[user], predicted[user], k)
         for user in actual if user in predicted),
        0.0,
    )
    return apk_sum / len(actual)

### Mean Reciprocal Rank - MRR

In [15]:
def mrr(actual, predicted):
    """Mean reciprocal rank of the first relevant recommendation per user.

    Parameters
    ----------
    actual : dict
        Maps each user to the collection of items relevant for that user.
    predicted : dict
        Maps each user to that user's ranked list of recommended items.

    Returns
    -------
    float
        Sum over users of ``1 / rank`` of the first recommended item that is
        in ``actual[user]`` (ranks start at 1), divided by ``len(actual)``.
        Users missing from ``predicted`` or without any relevant
        recommendation contribute 0. Returns ``0.0`` when ``actual`` is
        empty.
    """
    # No users to average over: avoid dividing by zero.
    if not actual:
        return 0.0

    mrr_sum = 0.0
    for user in actual:
        if user not in predicted:
            continue
        for rank, movie in enumerate(predicted[user], start=1):
            if movie in actual[user]:
                # Only the best-ranked relevant item counts for this user.
                mrr_sum += 1.0 / rank
                break
    return mrr_sum / len(actual)

-----------

In [16]:
# CSV with the held-out test split to evaluate on.
# Alternative splits (swap in as needed):
#   'eval_data/test_split_profile_1.csv'
#   'eval_data/test_split_1m_added_imdb_context.csv'
test_split_data_path = 'eval_data/test_split_25ml.csv'

In [17]:
# Read the selected test split into a DataFrame and display it.
test_ratings = pd.read_csv(filepath_or_buffer=test_split_data_path)
test_ratings

Unnamed: 0,userId,movieId,day,isWeekday,season,partOfDay,holiday,movieYear,titleType,isAdult,...,genreNews,genreReality-tv,genreRomance,genreSci-fi,genreShort,genreSport,genreThriller,genreWar,genreWestern,rating
0,43093,1923,3,1,2,4,summer_holiday,1998,movie,0,...,0,0,1,0,0,0,0,0,0,4.0
1,58800,57669,3,1,1,2,no_holiday,2008,movie,0,...,0,0,0,0,0,0,1,0,0,4.5
2,134109,69075,5,1,2,4,summer_holiday,1997,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
3,141503,1663,5,1,4,1,no_holiday,1981,movie,0,...,0,0,0,0,0,0,0,1,0,4.5
4,147198,1136,4,1,4,4,no_holiday,1975,movie,0,...,0,0,0,0,0,0,0,0,0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2498342,107639,3977,6,0,3,4,no_holiday,2000,movie,0,...,0,0,0,0,0,0,0,0,0,2.5
2498343,22136,2870,1,1,3,4,no_holiday,1967,movie,0,...,0,0,0,0,0,0,0,0,0,4.0
2498344,162047,7883,7,0,1,3,no_holiday,1943,movie,0,...,0,0,0,0,0,0,0,0,0,3.5
2498345,99479,54995,1,1,3,4,no_holiday,2007,movie,0,...,0,0,0,1,0,0,0,0,0,3.0


In [18]:
# Preprocessing artifacts: directory plus the scaler file name inside it.
recsys_data_path = '../data/transform_data/25m/'
scaler_file = '25m_added_imdb_context_scaler.pkl'
# target_scaler_file = '25m_added_imdb_context_target_scaler.pkl'

# Saved Keras model that produces the rating predictions.
nn_model_path = '../model/arch8_25m_added_imdb_context_max_abs_scaler_pc_trained.keras'

In [19]:
# --- Evaluation settings ---
n_users = 10       # how many users to evaluate
top_k = 10         # how many recommendations to keep per user
low_rating = 4.0   # minimum rating for a movie to count as a positive

# value_counts() orders userIds by descending number of test ratings, so the
# slice keeps the n_users users with the most rows in the test split.
test_ratings_all_users = (
    test_ratings['userId']
    .value_counts()
    .index[:n_users]
    .tolist()
)

# Load the saved network from nn_model_path.
nn_model = load_model(nn_model_path, compile=True)

In [20]:
# Loaded preprocessing artifacts, keyed by (data dir, scaler file) so that a
# changed configuration triggers a fresh load instead of reusing stale objects.
_scale_artifacts_cache = {}

# Columns that are label-encoded before scaling; each one has a matching
# '<column>_label_encoder.pkl' file under recsys_data_path.
_LABEL_ENCODED_COLUMNS = ('actor', 'directors', 'holiday', 'titleType')


def _load_scale_artifacts():
    """Return ``(label_encoders, scaler)``, reading them from disk only once."""
    cache_key = (recsys_data_path, scaler_file)
    if cache_key not in _scale_artifacts_cache:
        # NOTE: joblib.load unpickles arbitrary objects -- only point
        # recsys_data_path at trusted, locally produced artifacts.
        label_encoders = {
            column: joblib.load(recsys_data_path + column + '_label_encoder.pkl')
            for column in _LABEL_ENCODED_COLUMNS
        }
        scaler = joblib.load(recsys_data_path + scaler_file)
        _scale_artifacts_cache[cache_key] = (label_encoders, scaler)
    return _scale_artifacts_cache[cache_key]


def scale_data(data):
    """Label-encode the categorical columns of ``data`` and scale all features.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows containing at least the columns 'actor', 'directors',
        'holiday' and 'titleType'. The frame is not modified.

    Returns
    -------
    The result of ``scaler.transform`` applied to the encoded copy of
    ``data``.

    Notes
    -----
    The encoders and scaler are loaded from ``recsys_data_path`` on the
    first call and reused afterwards, so calling this once per user does not
    re-read the pickle files each time.
    """
    label_encoders, scaler = _load_scale_artifacts()

    # Work on a copy so the caller's DataFrame keeps its original values.
    encoded = data.copy()
    for column, encoder in label_encoders.items():
        encoded[column] = encoder.transform(encoded[column])

    return scaler.transform(encoded)

In [21]:
# Per-user ground truth (movies rated >= low_rating) and top_k predictions.
actual, predicted = {}, {}

for user_id in test_ratings_all_users:
    # All test-split rows belonging to this user.
    user_test_ratings = test_ratings.loc[test_ratings['userId'] == user_id]

    # Relevant items: movies the user rated at or above the threshold.
    actual_ratings = user_test_ratings.loc[user_test_ratings['rating'] >= low_rating]
    actual[user_id] = actual_ratings['movieId'].tolist()

    # Score every test movie of the user with the network (target column removed).
    scaled_user_test_ratings = scale_data(user_test_ratings.drop(columns=['rating']))
    movie_indices = user_test_ratings['movieId'].values
    predictions = nn_model.predict(scaled_user_test_ratings, verbose=0).flatten()

    # Rank movies by predicted rating, highest first, and keep the top_k ids.
    predicted_movies = pd.DataFrame({'movieId': movie_indices, 'rating': predictions})
    predicted_movies = predicted_movies.sort_values(by='rating', ascending=False)
    predicted[user_id] = predicted_movies['movieId'].iloc[:top_k].tolist()

In [22]:
# profile 1 - 72315 | profile 2 - 80974 | profile 3 - 107650

----

In [23]:
# MRR
# Store the result under its own name: assigning it to `mrr` would replace the
# metric function with a float and make any re-run of this cell fail.
mrr_score = mrr(actual, predicted)
mrr_score

0.95

In [29]:
# MAP@K -- evaluated at the same cut-off (top_k) that was used to truncate
# each user's list in `predicted`, instead of a separately hard-coded 10.
mapak = mapk(actual, predicted, top_k)
mapak

0.7134365079365079