In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
target_user_id = 1
number_of_top_similarities = 2

In [3]:
# depth of this file in the project
file_depth = '../..'

user_similarities_file = f"rating_user_data/similarities_to_user_{target_user_id}.json"
save_sets_of_categories_file = 'rating_user_data/sets_of_categories_for_each_user.json'

with open(user_similarities_file, 'r') as f:
    similarities = json.load(f)

with open(save_sets_of_categories_file, 'r') as f:
    sets_of_categories = json.load(f)

In [4]:
similarities[0]['2']

0.5477225575051662

In [5]:
sets_of_categories[0]['2']

{'Animation': 0,
 'ForChildren': 0,
 'Comedy': 3,
 'Adventure': 0,
 'Fantasy': 0,
 'Romance': 1,
 'Drama': 2,
 'Action': 2,
 'Crime': 0,
 'Thriller': 2,
 'Horror': 0,
 'Sci-Fi': 1,
 'Documentary': 0,
 'War': 1,
 'Musical': 0,
 'Mystery': 0,
 'Film-Noir': 0,
 'Western': 0}

In [6]:
# load all the splits into a list
splits = []

for file in os.listdir('rating_user_data/splits/'):
    if file.endswith('.csv'):
        split = pd.read_csv('rating_user_data/splits/' + file)
        splits.append(split)

In [7]:
splits[0]

Unnamed: 0,user_id,movie_id,rating,timestamp,genres
0,1671,3342,5,974713449,Drama|War
1,2569,3929,4,973911951,Comedy
2,3747,1639,4,966138457,Drama|Romance
3,3821,3702,4,974759703,Action|Sci-Fi
4,5173,3147,5,961886251,Drama|Thriller
...,...,...,...,...,...
47995,1880,1028,4,975384642,ForChildren|Comedy|Musical
47996,2153,1219,5,974623040,Horror|Thriller
47997,5767,1580,4,958192067,Action|Adventure|Comedy|Sci-Fi
47998,4217,2458,4,965319593,Comedy|Crime


In [8]:
top_similarities = []

for similarity in similarities:
    sorted_similarities = sorted(similarity.items(), key=lambda item: item[1], reverse=True)
    top_similarities.append(sorted_similarities[:number_of_top_similarities])

In [9]:
top_similarities

[[('413', 0.9797958971132712), ('2831', 0.9689627902499088)],
 [('3619', 1), ('4540', 0.9707253433941511)],
 [('764', 0.9394421721142376), ('890', 0.9132483825423161)],
 [('1481', 0.9648363026488435), ('320', 0.9438798074485388)],
 [('2003', 1), ('5299', 1)],
 [('1092', 0.9566807697649699), ('4603', 0.938083151964686)],
 [('3560', 0.9432422182837987), ('2957', 0.9045340337332909)],
 [('993', 0.890870806374748), ('980', 0.8660254037844387)],
 [('333', 0.8944271909999159), ('885', 0.8888888888888888)],
 [('134', 0.8944271909999159), ('2379', 0.8944271909999159)],
 [('1359', 0.9486832980505138), ('2444', 0.9486832980505138)],
 [('707', 1), ('4772', 0.9669875568304563)]]

----------

In [10]:
def count_categories_occurrence_in_dataframe(ratings):
    categories = {}
    for index, row in ratings.iterrows():
        for category in row['genres'].split('|'):
            if category in categories:
                categories[category] += 1
            else:
                categories[category] = 1
    return categories


def get_top_n_occurrences(category_occurrence, top):
    keys_with_max_value = []
    if top is None:
        max_value = max(category_occurrence.values())
        for key, value in category_occurrence.items():
            if value == max_value:
                keys_with_max_value.append(key)
    else:
        for item in range(top):
            max_value = max(category_occurrence.values())
            for key, value in category_occurrence.items():
                if value == max_value:
                    keys_with_max_value.append(key)
                    category_occurrence.pop(key)
                    break
    return keys_with_max_value


In [11]:
def make_recommendation(train_splits, number_of_top_similarities, number_of_newest_ratings, top_n): 
    recommended_genres = []

    # for each split except the last one (the last one is the test set)
    for index, split in enumerate(train_splits):

        # get the user_id of the user that is most similar to the target user
        # top_similarities[][][] -> [index of the split][index of the similarity][0 -> user_id, 1 -> similarity]
        user_x = int(number_of_top_similarities[index][0][0])

        # get the 5 newest ratings of the user that is most similar to the target user
        # sort by timestamp in descending order
        user_x_ratings = split.loc[split['user_id'] == user_x].sort_values(by='timestamp', ascending=False)[:number_of_newest_ratings]

        categories_occurrence = count_categories_occurrence_in_dataframe(user_x_ratings)

        if top_n is None:
            keys_with_max_value = get_top_n_occurrences(categories_occurrence, None)
        else:
            keys_with_max_value = get_top_n_occurrences(categories_occurrence, top_n)
        
        recommended_genres.append(keys_with_max_value)

    return recommended_genres

In [12]:
number_of_newest_ratings = 5

recommended_genres = make_recommendation(splits[:-1], top_similarities, number_of_newest_ratings, None)
recommended_genres

[['Drama'],
 ['Drama'],
 ['Fantasy', 'Sci-Fi', 'Action', 'Adventure', 'Drama'],
 ['Drama'],
 ['ForChildren'],
 ['Animation', 'Musical'],
 ['Drama'],
 ['Comedy'],
 ['ForChildren', 'Musical'],
 ['Action'],
 ['Animation', 'ForChildren']]

In [13]:
# target user
sets_of_categories[0]['1']

{'Animation': 3,
 'ForChildren': 3,
 'Comedy': 3,
 'Adventure': 0,
 'Fantasy': 0,
 'Romance': 1,
 'Drama': 4,
 'Action': 0,
 'Crime': 0,
 'Thriller': 0,
 'Horror': 0,
 'Sci-Fi': 0,
 'Documentary': 0,
 'War': 0,
 'Musical': 1,
 'Mystery': 0,
 'Film-Noir': 0,
 'Western': 0}

In [14]:
# most similar user
sets_of_categories[0]['413']

{'Animation': 2,
 'ForChildren': 2,
 'Comedy': 2,
 'Adventure': 0,
 'Fantasy': 0,
 'Romance': 1,
 'Drama': 4,
 'Action': 0,
 'Crime': 0,
 'Thriller': 0,
 'Horror': 0,
 'Sci-Fi': 0,
 'Documentary': 0,
 'War': 0,
 'Musical': 1,
 'Mystery': 0,
 'Film-Noir': 0,
 'Western': 0}

--------------------------
### Evaluation of the model
--------------------------

Mean Average Precision MAP@k

In [15]:
# author: Ben Hamner
# author's github: benhamner
# link to github: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py 

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [16]:
def count_categories_occurrence_in_list(recommended_categories):
    categories = {}
    for split in recommended_categories:
        for category in split:
            if category in categories:
                categories[category] += 1
            else:
                categories[category] = 1
    return categories

In [17]:
top_n_to_test = [1, 2, 3, 5, 7]

for index, top_n in enumerate(top_n_to_test):
    print('=================================================')
    print(f"========== Mean Average Precision @K={top_n} ==========\n")

    category_count = count_categories_occurrence_in_list(recommended_genres)
    top_n_recommended_categories = get_top_n_occurrences(category_count, top_n)
    print('Top recommended categories in the training set:')
    print(f"    {top_n_recommended_categories}")

    test_split = []
    test_split.append(splits[-1])

    test_recommended_genres = make_recommendation(test_split, top_similarities, number_of_newest_ratings, top_n)
    print('Top recommended categories in the testing set:')
    print(f"    {test_recommended_genres[0]}")

    mean_average_precision_at_k  = mapk(test_recommended_genres[0], top_n_recommended_categories, top_n)
    print(f"\nMean Average Precision @K={top_n}: {mean_average_precision_at_k:.3f}\n")


Top recommended categories in the training set:
    ['Drama']
Top recommended categories in the testing set:
    ['Drama']

Mean Average Precision @K=1: 1.000


Top recommended categories in the training set:
    ['Drama', 'ForChildren']
Top recommended categories in the testing set:
    ['Drama', 'Animation']

Mean Average Precision @K=2: 0.625


Top recommended categories in the training set:
    ['Drama', 'ForChildren', 'Action']
Top recommended categories in the testing set:
    ['Drama', 'Animation', 'ForChildren']

Mean Average Precision @K=3: 0.389


Top recommended categories in the training set:
    ['Drama', 'ForChildren', 'Action', 'Animation', 'Musical']
Top recommended categories in the testing set:
    ['Drama', 'Animation', 'ForChildren', 'Comedy', 'Romance']

Mean Average Precision @K=5: 0.224


Top recommended categories in the training set:
    ['Drama', 'ForChildren', 'Action', 'Animation', 'Musical', 'Fantasy', 'Sci-Fi']
Top recommended categories in the testing se