In [2]:
import nbimporter
from surprise import AlgoBase
from surprise import PredictionImpossible
from MovieLens import MovieLens
import math
import heapq
import numpy as np

Importing Jupyter notebook from MovieLens.ipynb


In [3]:
class ContentKNNAlgorithm(AlgoBase):
    """Content-based k-nearest-neighbours rating predictor.

    Item-to-item similarity is the product of a genre cosine similarity and
    an exponentially decaying release-year similarity.  A rating is predicted
    as the similarity-weighted average of the ratings the user gave to the
    ``k`` items most similar to the target item.
    """

    def __init__(self, k=40, sim_options=None):
        """Create the algorithm.

        Args:
            k: Maximum number of neighbours used for a single prediction.
            sim_options: Accepted for signature compatibility only; it is
                not used by this class.
        """
        AlgoBase.__init__(self)
        self.k = k

    def fit(self, trainset):
        """Build the item-item content similarity matrix for ``trainset``.

        Stores a symmetric ``n_items x n_items`` array in
        ``self.similarities``, indexed by inner item ids.  The diagonal is
        left at zero (an item is never compared with itself).

        Args:
            trainset: The training set passed on to ``AlgoBase.fit``.

        Returns:
            This instance.
        """
        AlgoBase.fit(self, trainset)
        ml = MovieLens()
        # Presumably both are mappings keyed by integer raw movie id
        # (genre vector / release year) -- verify against MovieLens.
        genres = ml.getGenres()
        years = ml.getYears()

        print("Computing content-based similarity matrix...")
        n_items = self.trainset.n_items
        self.similarities = np.zeros((n_items, n_items))

        # Resolve every inner id to its raw movie id once, instead of
        # repeating the conversion inside the O(n^2) pair loop below.
        movieIDs = [int(self.trainset.to_raw_iid(innerID)) for innerID in range(n_items)]

        for currRating in range(n_items):
            if currRating % 1000 == 0:
                print(currRating, " of ", n_items)
            currMovieID = movieIDs[currRating]
            # Only the upper triangle is computed; each value is mirrored.
            for secRating in range(currRating + 1, n_items):
                secMovieID = movieIDs[secRating]
                genreSimilarity = self.computeGenreSimilarity(currMovieID, secMovieID, genres)
                yearSimilarity = self.computeYearSimilarity(currMovieID, secMovieID, years)

                similarity = genreSimilarity * yearSimilarity
                self.similarities[currRating, secRating] = similarity
                self.similarities[secRating, currRating] = similarity

        print("...done")
        return self

    def computeGenreSimilarity(self, movie1, movie2, genres):
        """Return the cosine similarity of two movies' genre vectors.

        Args:
            movie1: Key of the first movie in ``genres``.
            movie2: Key of the second movie in ``genres``.
            genres: Mapping from movie key to a sequence of numbers
                (assumed to be the same length for every movie -- TODO confirm).

        Returns:
            The cosine similarity, or ``0.0`` when either vector is all
            zeros (the cosine is undefined there and would otherwise raise
            ``ZeroDivisionError``).
        """
        genres1 = genres[movie1]
        genres2 = genres[movie2]
        sumxx = sumxy = sumyy = 0
        for x, y in zip(genres1, genres2):
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y

        denominator = sumxx * sumyy
        if denominator == 0:
            # A movie without any genre shares nothing measurable.
            return 0.0
        return sumxy / math.sqrt(denominator)

    def computeYearSimilarity(self, movie1, movie2, years):
        """Return ``exp(-|year1 - year2| / 10)`` for the two movies.

        The result is 1.0 for identical years and decays towards 0 as the
        gap between the two values grows.

        Args:
            movie1: Key of the first movie in ``years``.
            movie2: Key of the second movie in ``years``.
            years: Mapping from movie key to a numeric year.
        """
        diff = abs(years[movie1] - years[movie2])
        return math.exp(-diff / 10.0)

    def estimate(self, user, item):
        """Predict the rating ``user`` would give to ``item``.

        Args:
            user: Inner user id.
            item: Inner item id.

        Returns:
            The similarity-weighted mean rating over the user's ``k`` most
            similar rated items that have a positive similarity.

        Raises:
            PredictionImpossible: If the user or item is not in the
                trainset, or no rated item has a positive similarity.
        """
        if not (self.trainset.knows_user(user) and self.trainset.knows_item(item)):
            raise PredictionImpossible('User and/or item is unkown.')

        # Pair the similarity between `item` and each item the user rated
        # with the rating the user gave to that item.
        neighbors = [
            (self.similarities[item, ratedItem], rating)
            for (ratedItem, rating) in self.trainset.ur[user]
        ]

        kNeighbors = heapq.nlargest(self.k, neighbors, key=lambda x: x[0])

        simTotal = weightedSum = 0
        for (simScore, rating) in kNeighbors:
            # Zero-similarity neighbours carry no information.
            if simScore > 0:
                simTotal += simScore
                weightedSum += simScore * rating

        if simTotal == 0:
            raise PredictionImpossible('No neighbors')

        return weightedSum / simTotal