# Content-Based Recommender Evaluator

Below is the functions used to evaluate my metadata content-based recommender. The base code was pulled from: https://github.com/smalec/movielens and was adjusted to fit the data and was written in Python 2 so some lines needed to be updated

In [16]:
from random import shuffle

class Evaluator(object):
    def __init__(self, recommender):
        self.recommender = recommender
   
    def computeMAP(self, relevant_treshold=3.0, topN=5):
        k_cross = 5
        total_aps = 0.0
        total = 0
        users_ratings = self.recommender.dataset_handler.load_users_ratings()
        training_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user < 0.8*len(users_ratings)}
        test_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user not in training_data}
        self.recommender.train(training_data)
        for user_ratings in test_data.values():

                #Old code
#             user_items = user_ratings.items()
#             shuffle(user_items)
#             parts = [
#                 user_items[k*(len(user_items)/k_cross):(k+1)*(len(user_items)/k_cross) if k < k_cross-1 else len(user_items)]
#                 for k in range(k_cross)
#             ]

            #Karim
            keys = list(user_ratings.keys())
            shuffle(keys)
            user_items = [(k,user_ratings[k]) for k in keys]
            parts = [
                user_items[k*int(len(user_items)/k_cross):(k+1)*int(len(user_items)/k_cross) if k < k_cross-1 else len(user_items)]
                for k in range(k_cross)
            ]
           
           
            for i in range(k_cross):
                test, training = parts[i], [rat for part in parts[:i]+parts[i+1:] for rat in part]
                relevant = [movieId for (movieId, rating) in test if rating >= relevant_treshold]
                user_profile = self.recommender.create_user_profile(dict(training))
                predicted = self.recommender.top(user_profile, topN=topN)
                if relevant:
                    total_aps += self._computeAP(relevant, predicted)
                    total += 1
        return total_aps/total
   
    def computeRMSE(self):
        k_cross = 5
        rse = 0.0
        total = 0
        users_ratings = self.recommender.dataset_handler.load_users_ratings()
        training_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user < 0.8*len(users_ratings)}
        test_data = {user: user_ratings for user, user_ratings in users_ratings.items() if user not in training_data}
        self.recommender.train(training_data)
        for user_ratings in test_data.values():
                #Old code
#             user_items = user_ratings.items()
#             shuffle(user_items)
#             parts = [
#                 user_items[k*(len(user_items)/k_cross):(k+1)*(len(user_items)/k_cross) if k < k_cross-1 else len(user_items)]
#                 for k in range(k_cross)
#             ]

            #Karim
            keys = list(user_ratings.keys())
            shuffle(keys)
            user_items = [(k,user_ratings[k]) for k in keys]
            parts = [
                user_items[k*int(len(user_items)/k_cross):(k+1)*int(len(user_items)/k_cross) if k < k_cross-1 else len(user_items)]
                for k in range(k_cross)
            ]
            for i in range(k_cross):
                test, training = parts[i], [rat for part in parts[:i]+parts[i+1:] for rat in part]
                user_profile = self.recommender.create_user_profile(dict(training))
                for (movieId, rating) in test:
                    predicted = self.recommender.predict_rating(user_profile, movieId)
                    if predicted > 0:
                        rse += (rating - predicted)**2
                        total += 1
        return rse/total
   
    def _computeAP(self, relevant, predicted):
        ap = 0.0
        good_predictions = 0.0
        for i, item in enumerate(predicted):
            if item in relevant:
                good_predictions += 1
                ap += 1.0/(i+1) * good_predictions/(i+1)
        return ap

In [17]:
import numpy as np
import os
import pandas as pd

genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western"
]

class DatasetHandler(object):
    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
    
    def ids2titles(self, ids):
        return [self.id_to_title[movieId] for movieId in ids]
    
    def indices2ids(self, indices):
        return [self.movie_index_to_movie_id[index] for index in indices]
    
    def id2index(self, movieId):
        return self.movie_index_to_movie_id.index(movieId)
    
    def movie_vector2genres(self, movie_vector):
        return [self.feature_index2genre(i) for i, x in enumerate(movie_vector) if x == 1]
    
    def feature_index2genre(self, feature_index):
        return genres[feature_index]

    def load_movies(self):
        movies_frame = pd.read_csv(os.path.join(self.dataset_path, "movies1.csv"), sep=",", engine="python")
        self.id_to_title = {}
        self.movie_index_to_movie_id = []
        movies_vectors = []
        for _, row in movies_frame.iterrows():
            genres_list = row["genres"].split("|")
            self.id_to_title[int(row["movieId"])] = row["title"]
            self.movie_index_to_movie_id.append(int(row["movieId"]))
            movies_vectors.append(np.array([1 if genre in genres_list else 0 for genre in genres]))
        return np.array(movies_vectors)

    def load_users_ratings(self):
        ratings_frame = pd.read_csv(os.path.join(self.dataset_path, "ratings1.csv"), sep=",", engine="python")
        users_ratings = {}
        for _, row in ratings_frame.iterrows():
            if int(row["userId"]) not in users_ratings:
                users_ratings[int(row["userId"])] = {}
            users_ratings[int(row["userId"])][int(row["movieId"])] = row["rating"]
        return users_ratings

In [18]:
%pylab inline

import numpy as np
from sklearn.neighbors import NearestNeighbors

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [19]:
# dataset100k = "./m-1MB/"
dataset100k = "/Users/connorranson/Downloads/"

In [20]:
dataset_handler = DatasetHandler(dataset100k)
user_ratings = dataset_handler.load_users_ratings()

In [21]:
class ContentBasedRecommender(object):
    def __init__(self, dataset_handler):
        self.dataset_handler = dataset_handler
        self.movies_vectors = self.dataset_handler.load_movies()
    
    def train(self, train_set):
        pass
    
    def top(self, user_profile, topN):
        return self._cosineKNN_all_movies(user_profile[0], topN)
    
    def predict_rating(self, user_profile, movieId):
        nearest_watched_movies = self._cosineKNN_movies_subset(user_profile[1].keys(), movieId, 5)
        return np.average(np.array([user_profile[1][movie] for movie in nearest_watched_movies]))
        
    def create_user_profile(self, user_ratings):
        return (
            np.average(
                np.array([
                    self.movies_vectors[self.dataset_handler.id2index(movie)]
                    for (movie, rating) in user_ratings.items()
                ]),
                #Old code
#                 weights=np.array(user_ratings.values()),
                #By Karim
                weights=np.array(list(user_ratings.values())),
                axis=0
            ),
            user_ratings
        )
    
    def present_user_profile(self, user_profile):
        print ("User favourite genre:", self.dataset_handler.feature_index2genre(np.argmax(user_profile[0])))
        print ("User ratings:")
        for (movieId, rating) in user_profile[1].items():
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            #By Karim
            print (self.dataset_handler.id_to_title[movieId], ' ',  \
                   self.dataset_handler.movie_vector2genres(movie_vector), ':',rating)
            #Old code
#             print ("{} {}: {}").format(
#                 self.dataset_handler.id_to_title[movieId],
#                 self.dataset_handler.movie_vector2genres(movie_vector),
#                 rating
#             )
    
    def present_recommendations(self, recommendations):
        print ("Recommended movies:")
        for movieId in recommendations:
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            #By Karim
            print(self.dataset_handler.id_to_title[movieId], self.dataset_handler.movie_vector2genres(movie_vector))
            #Old code
#             print ("{} {}").format(
#                 self.dataset_handler.id_to_title[movieId],
#                 self.dataset_handler.movie_vector2genres(movie_vector)
#             )
    
    def _cosineKNN_all_movies(self, user_profile, k):
        nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
        nbrs.fit(self.movies_vectors)
        return self.dataset_handler.indices2ids(nbrs.kneighbors(np.array([user_profile]), k, return_distance=False)[0])
    
    def _cosineKNN_movies_subset(self, movies_subset, movieId, k):
        nbrs = NearestNeighbors(k, metric='cosine', algorithm='brute')
        movies_with_ids = np.array([
            np.hstack([[watched_movie], self.movies_vectors[self.dataset_handler.id2index(watched_movie)]])
            for watched_movie in movies_subset
        ])
        nbrs.fit(movies_with_ids[:, 1:])
        return movies_with_ids[
            nbrs.kneighbors(
                np.array([self.movies_vectors[self.dataset_handler.id2index(movieId)]]), return_distance=False
            )[0],
            0
        ]

In [22]:
recommender = ContentBasedRecommender(dataset_handler)

In [23]:
user_profile = recommender.create_user_profile(user_ratings[1])

In [24]:
recommender.present_user_profile(user_profile)

User favourite genre: Action
User ratings:
Toy Story (1995)   ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'] : 4.0
Grumpier Old Men (1995)   ['Comedy', 'Romance'] : 4.0
Heat (1995)   ['Action', 'Crime', 'Thriller'] : 4.0
Seven (a.k.a. Se7en) (1995)   ['Mystery', 'Thriller'] : 5.0
Usual Suspects, The (1995)   ['Crime', 'Mystery', 'Thriller'] : 5.0
From Dusk Till Dawn (1996)   ['Action', 'Comedy', 'Horror', 'Thriller'] : 3.0
Bottle Rocket (1996)   ['Adventure', 'Comedy', 'Crime', 'Romance'] : 5.0
Braveheart (1995)   ['Action', 'Drama', 'War'] : 4.0
Rob Roy (1995)   ['Action', 'Drama', 'Romance', 'War'] : 5.0
Canadian Bacon (1995)   ['Comedy', 'War'] : 5.0
Desperado (1995)   ['Action', 'Romance', 'Western'] : 5.0
Billy Madison (1995)   ['Comedy'] : 5.0
Clerks (1994)   ['Comedy'] : 3.0
Dumb & Dumber (Dumb and Dumber) (1994)   ['Adventure', 'Comedy'] : 5.0
Ed Wood (1994)   ['Comedy', 'Drama'] : 4.0
Star Wars: Episode IV - A New Hope (1977)   ['Action', 'Adventure', 'Sci-Fi'] : 

In [25]:
top = recommender.top(user_profile, topN=5)

In [26]:
recommender.present_recommendations(top)

Recommended movies:
Dragonheart 2: A New Beginning (2000) ['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Thriller']
Hunting Party, The (2007) ['Action', 'Adventure', 'Comedy', 'Drama', 'Thriller']
Flashback (1990) ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama']
The Great Train Robbery (1978) ['Action', 'Adventure', 'Comedy', 'Crime', 'Drama']
Stunt Man, The (1980) ['Action', 'Adventure', 'Comedy', 'Drama', 'Romance', 'Thriller']


In [27]:
evaluator = Evaluator(ContentBasedRecommender(dataset_handler))
evaluator.computeMAP()

0.009918887076309588

In [28]:
evaluator = Evaluator(ContentBasedRecommender(dataset_handler))
evaluator.computeRMSE()

0.9479311265350095