In [1]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import LeaveOneOut, train_test_split
from collections import defaultdict
import os
import pandas as pd
import numpy as np
import heapq
import tensorflow as tf
# Load the ratings table, then discard the timestamp column.
df = pd.read_csv('ratings_small.csv')
df = df.drop(columns=['timestamp'])
df

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


In [2]:
def getTopN(predictions, n = 10, minimumRating=4.0):
    """Group predictions per user and keep each user's n best estimates.

    Args:
        predictions: iterable of 5-item records laid out as
            (user id, movie id, actual rating, estimated rating, extra).
        n: maximum number of (movie id, estimate) pairs kept per user.
        minimumRating: estimates below this value are discarded.

    Returns:
        A defaultdict(list) keyed by user id. Each value is a list of
        (movie id, estimated rating) pairs ordered from the highest estimate
        down. Users with no qualifying estimate have no key, and indexing
        such a user yields an empty list.
    """
    perUser = defaultdict(list)
    for user, movie, _actual, estimate, _extra in predictions:
        if not estimate >= minimumRating:
            continue
        perUser[user].append((movie, estimate))

    # Snapshot the keys first, then swap every candidate list for its top n.
    for user in list(perUser):
        perUser[user] = heapq.nlargest(n, perUser[user], key=lambda pair: pair[1])
    return perUser

def recommend_getTopN(predictions, uid, n = 10, minimumRating=4.0, ratings_df=None):
    """Return the n best-estimated movies for one user, skipping rated ones.

    Args:
        predictions: iterable of 5-item records laid out as
            (user id, movie id, actual rating, estimated rating, extra).
        uid: the user to recommend for; records for other users are ignored.
        n: maximum number of rows in the result.
        minimumRating: estimates below this value are discarded.
        ratings_df: table with 'userId' and 'movieId' columns listing the
            ratings that already exist. When omitted, the notebook-level
            `df` is used, as before.

    Returns:
        A DataFrame indexed by movie id (cast to int) with a single
        'ratings' column holding the estimates, sorted from highest to
        lowest and truncated to n rows. It is empty when nothing qualifies.
    """
    if ratings_df is None:
        ratings_df = df

    # The user's rated movies do not change while scanning the predictions,
    # so look them up once instead of re-filtering the table per record.
    seen_movies = set(ratings_df.loc[ratings_df['userId'] == uid, 'movieId'])

    estimates = {}
    for userID, movieID, _actual, estimatedRating, _ in predictions:
        if userID == uid and estimatedRating >= minimumRating:
            if movieID not in seen_movies:
                # A later record for the same movie overwrites an earlier one.
                estimates[movieID] = estimatedRating

    recommended = pd.DataFrame(
        {'ratings': list(estimates.values())},
        index=np.array(list(estimates.keys())).astype(int),
    )
    return recommended.sort_values(by='ratings', ascending=False).head(n)

def HitRate(topN, leaveoutdata):
    """Compute the share of left-out movies that appear in a user's top-N list.

    Args:
        topN: mapping from user id to a list of (movie id, estimated rating)
            pairs. It is only read: users without an entry count as a miss
            and no entry is created for them.
        leaveoutdata: iterable of 5-item records laid out as
            (user id, left-out movie id, actual rating, estimated rating,
            extra). User and movie ids are converted with int() before
            matching.

    Returns:
        hits / total as a float, or 0.0 when leaveoutdata is empty.

    Side effects:
        Prints the hit count and the number of left-out records.
    """
    hit = 0
    total = 0
    for userID, leftOutMovieID, _actual, _estimated, _ in leaveoutdata:
        target = int(leftOutMovieID)
        # .get() avoids a KeyError on plain dicts and avoids inserting empty
        # lists into a defaultdict for users that have no recommendations.
        recommended = topN.get(int(userID), ())
        if any(movieID == target for movieID, _rating in recommended):
            hit += 1
        total += 1
    print('hit times :',hit,' total number :',total)
    if total == 0:
        # Nothing was left out, so there is no rate to divide.
        return 0.0
    return hit / total

def evaluate(algo, n = 10):
    """Fit algo on the current leave-one-out split and report its top-n hit rate.

    Reads the module-level names loocvTrain, loocvTest and loocvAntiTestSet,
    which the caller must assign before each call.

    Args:
        algo: object exposing fit(trainset) and test(testset).
        n: length of the per-user recommendation list passed to getTopN.

    Returns:
        The value returned by HitRate for this split.

    Side effects:
        Refits algo and prints the hit rate (HitRate prints the raw counts).
    """
    algo.fit(loocvTrain)
    leftOutPredictions = algo.test(loocvTest)
    allPredictions = algo.test(loocvAntiTestSet)
    topNPredicted = getTopN(allPredictions, n)
    hit_rate = HitRate(topNPredicted, leftOutPredictions)
    # Label the output with the n actually used instead of a fixed "10".
    print('TOP {} hit rate :'.format(n), hit_rate)
    return hit_rate

def recommend(algo, uid, n = 10):
    """Fit algo on the full train set and return the top-n movies for one user.

    Reads the module-level names trainSet and testSet, which the caller must
    assign before the call.

    Args:
        algo: object exposing fit(trainset) and test(testset).
        uid: user id to recommend for; compared against the first item of
            each testSet entry.
        n: maximum number of recommendations to return.

    Returns:
        Whatever recommend_getTopN returns for this user and n.
    """
    algo.fit(trainSet)
    # Only this user's pairs can end up in the result, so do not score the
    # pairs that belong to every other user.
    userPairs = [pair for pair in testSet if pair[0] == uid]
    allPredictions = algo.test(userPairs)
    # Forward n; it used to be accepted here but never passed on.
    top_N = recommend_getTopN(allPredictions, uid, n)
    return top_N

In [3]:
# The Reader below declares a 1-10 scale, so the ratings are doubled
# (assumes the CSV uses a 0.5-5.0 half-star scale -- TODO confirm).
# The doubling is applied to a copy: re-running this cell no longer doubles
# `df.rating` again, and `df` keeps the scale it was loaded with.
ratings_scaled = df[['userId', 'movieId', 'rating']].assign(
    rating=lambda frame: frame['rating'] * 2
)
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_scaled, reader)

#Evaluation
# NOTE(review): no random_state is passed to LeaveOneOut() or SVD(), so the
# hit rates presumably differ from run to run -- confirm whether a seed is wanted.
# NOTE(review): evaluate() calls getTopN with its default minimumRating=4.0,
# which is applied to these 1-10 scaled estimates -- confirm that is intended.
loocv = LeaveOneOut()
func = SVD()

for times, (train, test) in enumerate(loocv.split(data), start=1):
    print('leave times :', times)
    # evaluate() reads these three names from module scope.
    loocvTrain = train
    loocvTest = test
    loocvAntiTestSet = loocvTrain.build_anti_testset()
    evaluate(func)

leave times : 1
hit times : 22  total number : 671
TOP 10 hit rate : 0.03278688524590164
leave times : 2
hit times : 27  total number : 671
TOP 10 hit rate : 0.040238450074515646
leave times : 3
hit times : 28  total number : 671
TOP 10 hit rate : 0.041728763040238454
leave times : 4
hit times : 20  total number : 671
TOP 10 hit rate : 0.029806259314456036
leave times : 5
hit times : 14  total number : 671
TOP 10 hit rate : 0.020864381520119227


In [4]:
#Recommender system
# recommend() reads trainSet and testSet from module scope, so both must be
# assigned before it is called. testSet is derived from trainSet's
# anti-testset (presumably the user/movie pairs without a known rating).
trainSet = data.build_full_trainset()
testSet = trainSet.build_anti_testset()
# input() returns a string; int(uid) below raises ValueError if it is not a
# whole number. The raw string is what gets echoed in the header line.
uid = input('input user ID for recommending movies : ')
# n is not passed, so recommend() uses its default of 10.
pred = recommend(func, int(uid))
print('Recommend movies for user {} :'.format(uid))
# The result's index holds the movie IDs (see recommend_getTopN).
for movie in pred.index:
    print('Movie ID :', movie)

input user ID for recommending movies : 4
Recommend movies for user 4 :
Movie ID : 923
Movie ID : 4011
Movie ID : 7153
Movie ID : 1246
Movie ID : 1945
Movie ID : 4993
Movie ID : 4886
Movie ID : 318
Movie ID : 1228
Movie ID : 6539
