# Movielens Recommendation System

by Umur Türkay and Yasemin Alpay

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import scipy.linalg as la
import pandas as pd
import heapq as hp
import math


First, we read the rating data using the pandas library.

In [9]:
# Column labels for the ratings file; header=None below means the file
# itself carries no header row.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']

# Fields are separated by the two-character token '::', parsed with the
# python engine.
ratings = pd.read_table(
    'data/ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

# Preview the first ten rows.
ratings[:10]


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


## Algorithm 1 - Item-Based k-NN

We used collaborative filtering and implemented item based k-NN algorithm to determine the recommendations. In order to implement the k-NN algorithm, we had to define a similarity function. 

The similarity functions that can be used in k-NN can vary, like Euclidean, Hamming, or Cosine similarity. We have chosen adjusted cosine similarity for our implementation. (Also can be found in: "Recommendation System Based on Collaborative Filtering", Chapter 3.1, by Zheng Wen, December 12, 2008).

Algorithm follows:

1. Each pair of movies is passed to the similarity function. The similarity is computed only over the users who have rated both of the movies being compared.

2. The k most similar movies for each movie are found using a heap.

3. We calculate a weighted average to predict the rating that user u would give to movie m. The weights are the similarity values between movie m and each of its k most similar movies.




Similarity function:

In [12]:
def sim(m1, m2):  # Movies to be compared
    """Return the adjusted cosine similarity between movies ``m1`` and ``m2``.

    Only users that occur in the rows of the global ``A`` for both movies
    contribute. For each such user the two ratings are looked up in the
    global ``ratings`` frame and centred on the mean of that user's
    ``A[:, 2]`` values; the result is the dot product of the centred
    ratings divided by the product of their Euclidean norms.

    NOTE(review): ``A`` is read as columns (user id, movie id, rating);
    it is defined outside this function -- confirm it mirrors ``ratings``.

    Args:
        m1: id of the first movie (matched against ``A[:, 1]``).
        m2: id of the second movie.

    Returns:
        The similarity as a float. ``0.0`` is returned when the similarity
        is undefined, i.e. no user rated both movies or all centred ratings
        of one movie are zero (previously a ``ZeroDivisionError``).
    """
    M1 = A[A[:, 1] == m1]
    M2 = A[A[:, 1] == m2]
    commonUsers = np.intersect1d(M1[:, 0], M2[:, 0])

    dot = 0.0
    diffSquare1 = 0.0
    diffSquare2 = 0.0
    for user in commonUsers:
        # Build the per-user row mask once and reuse it for both movies.
        isUser = ratings.user_id == user
        userRatingMovie1 = np.array(
            ratings[isUser & (ratings.movie_id == m1)]['rating'])[0]
        userRatingMovie2 = np.array(
            ratings[isUser & (ratings.movie_id == m2)]['rating'])[0]

        U = A[A[:, 0] == user]
        userAvgRating = float(np.mean(U[:, 2]))

        diffMov1 = float(userRatingMovie1 - userAvgRating)
        diffMov2 = float(userRatingMovie2 - userAvgRating)

        dot += diffMov1 * diffMov2
        diffSquare1 += diffMov1 ** 2
        diffSquare2 += diffMov2 ** 2

    denominator = math.sqrt(diffSquare1 * diffSquare2)
    if denominator == 0:
        # No overlap or no variance: there is no evidence of similarity.
        return 0.0
    return float(dot / denominator)

Heap:

In [13]:
# heapq implements a min-heap; wrapping values in `tup` makes it pop the
# LARGEST value first.
# The ordering operators are reversed consistently; equality is unchanged.
class tup:
    """A (value, index) pair whose ordering is the reverse of ``val``'s.

    ``a < b`` is true when ``a.val > b.val``, so pushing ``tup`` objects
    onto a ``heapq`` heap yields the entry with the greatest ``val`` first.
    ``__le__``, ``__gt__`` and ``__ge__`` follow the same reversed order so
    that the comparisons never contradict each other. Equality and
    inequality compare ``val`` only; ``idx`` is ignored.
    """

    def __init__(self, val, idx):
        self.val = val  # key used for every comparison
        self.idx = idx  # payload carried along with the key

    def __lt__(self, other):
        '''Redefine for max-heap'''
        return self.val > other.val

    def __le__(self, other):
        return self.val >= other.val

    def __eq__(self, other):
        return self.val == other.val

    def __ne__(self, other):
        return self.val != other.val

    def __gt__(self, other):
        return self.val < other.val

    def __ge__(self, other):
        return self.val <= other.val

    def __str__(self):
        return '{:.3},{:d}'.format(self.val, self.idx)

In [8]:
user_id = 1  # user id to give recommendation

usersRatedMovies = np.array(ratings[(ratings.user_id == user_id)]['movie_id'])
allMovies = np.array(ratings['movie_id'])
# allMovies has one entry per rating row, so every movie id is repeated once
# per rating it received. Reduce to distinct ids so that each pair of movies
# is compared exactly once.
uniqueMovies = np.unique(allMovies)
usersUnRatedMovies = list(set(uniqueMovies) - set(usersRatedMovies))

N = allMovies.shape[0]  # number of rating rows (kept for later cells)
comparedMovie = 1  # initial value

# We only predict for the movies the user has not rated yet.
for comparedMovie in usersUnRatedMovies:
    for candidateMovie in uniqueMovies:
        # NOTE(review): simValue is overwritten on every iteration and not
        # stored anywhere; selecting the k nearest neighbours still has to
        # be added on top of this loop.
        simValue = sim(comparedMovie, candidateMovie)




11
****
1000209
661


## Algorithm 2 - Stochastic Gradient Descent

We implemented SGD as our second algorithm.

In [16]:
# Column vectors holding, row by row, the user id, movie id and rating.
userM = np.matrix(ratings.user_id).T
movieM = np.matrix(ratings.movie_id).T
ratingM = np.matrix(ratings.rating).T

# Largest ids; ids are used directly as row/column indices below, so the
# matrices get one extra row/column (index 0..max id).
# Taking the last element of a set depended on set iteration order, and
# np.hstack no longer accepts a set.
userLength = int(ratings['user_id'].max())
movieLength = int(ratings['movie_id'].max())

userStart = np.matrix([0]*(userLength+1)).T
movieStart = np.matrix([0]*(movieLength+1))

# Y[u, m] is the rating user u gave movie m (0 where unrated); M[u, m] is 1
# where a rating exists. They are separate objects: previously M aliased an
# all-zero Y, so neither the ratings nor the mask was ever populated.
userIdx = np.asarray(ratings['user_id'])
movieIdx = np.asarray(ratings['movie_id'])
ratingGrid = np.zeros((userLength+1, movieLength+1))
ratingGrid[userIdx, movieIdx] = np.asarray(ratings['rating'])
maskGrid = np.zeros((userLength+1, movieLength+1))
maskGrid[userIdx, movieIdx] = 1
Y = np.matrix(ratingGrid)
M = np.matrix(maskGrid)

# One row per rating: (rating, user id, movie id).
Ys = np.hstack([ratingM,userM,movieM])
Ysize = Ys.shape[0]

In [None]:
# Randomly initialised factors: A is a column with one entry per user index,
# B is a row with one entry per movie index, so A*B is a rank-1 estimate.
A = np.mat(np.random.rand(userLength + 1, 1))
B = np.mat(np.random.rand(1, movieLength + 1))

EPOCH = 5
Eta = 0.1   # base learning rate
eta = Eta   # learning rate used in the current epoch

for i in range(EPOCH):
    # Element-wise error of the current estimate, weighted by M
    # (presumably the mask of observed entries -- verify where M is built).
    maskArr = np.array(M)
    E = maskArr * np.array(Y - A * B)
    Err = np.sum(E * E) / np.sum(maskArr)

    # One pass over the rating rows; each row of Ys is read as
    # (rating, user index, movie index).
    for k in range(Ysize):
        u = Ys[k, 1]
        m = Ys[k, 2]

        err = Ys[k, 0] - A[u, :] * B[:, m]
        step = eta * err[0, 0]

        # Compute the new A entry first but store it last, so that the B
        # update still sees the old A value.
        temp_A = A[u, :] + step * B[:, m].T
        B[:, m] = B[:, m] + step * A[u, :].T
        A[u, :] = temp_A

    # Decay the learning rate for the next epoch.
    eta = Eta * 1. / (i + 1)

In [17]:
# Predicted rating for every (user index, movie index) pair.
res = A*B
user_id = 6

# Rank all movies for this user by predicted rating, best first.
# B has movieLength+1 columns, so the highest movie id is column movieLength
# and must be included. Column 0 is skipped (assumes movie ids start at 1 --
# confirm against the data), and no dummy (0, 0) entry is seeded.
userResult = [(res[user_id, m], m) for m in range(1, movieLength + 1)]
userResult.sort(key=lambda pair: pair[0], reverse=True)

# Parenthesised so the statement is valid under both Python 2 and Python 3.
print(userResult)

NameError: name 'A' is not defined